1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-22 09:42:02 +01:00

Compare commits

...

229 Commits

Author SHA1 Message Date
481bbaf1fc Interface to query memory use 2023-03-23 12:55:31 -04:00
281488611a WriteDiscard on construct 2023-03-23 10:28:50 -04:00
bae0f8ea99 Merge pull request #425 from rrhodgson/feature/CacheLogging
Huge Cache
2023-03-21 08:59:08 -04:00
bbbcd36ae5 Merge pull request #426 from rrhodgson/feature/LCDeflation
Batched Local Coherence Tools
2023-03-21 08:58:40 -04:00
39c0815d9e WriteDiscard 2023-03-21 08:57:29 -04:00
a3e935c902 Batched block project/promote size checks 2023-02-27 11:38:16 +00:00
7731c7db8e Add huge cache type and allow Ncache==0 2023-02-26 14:15:28 +00:00
ff97340324 Expose cached bytes 2023-02-26 12:22:45 +00:00
920a51438d Added batched Mixed precision CG 2023-02-14 17:04:13 +00:00
be528b6d27 Add batched block project/promote functions 2023-02-14 14:37:10 +00:00
796abfad80 Merge pull request #422 from fjosw/fix/NVCC_DIAG_PRAGMA_SUPPORT
Disable diagnostic pragma warnings for CUDA 12+
2023-01-17 09:34:49 -05:00
ad0270ac8c fix: diagnostic pragma warnings fixed for CUDA 12+ 2023-01-12 12:36:30 +00:00
4ca1bf7cca Added gauge invariance test 2022-12-21 07:23:16 -05:00
2ff868f7a5 CPU open doesn't need to free space 2022-12-20 05:10:23 -05:00
ede02b6883 Memory manager debug Felix case 2022-12-20 05:10:23 -05:00
1822ced302 Bug fix 2022-12-20 05:10:23 -05:00
37ba32776f More logging 2022-12-20 05:10:23 -05:00
99b3697b03 More loggin 2022-12-20 05:10:23 -05:00
43a45ec97b SSC_START 2022-12-20 05:10:23 -05:00
b00a4142e5 A=A fix 2022-12-20 05:10:23 -05:00
3791bc527b Logging pulled in from dirichlet branch 2022-12-20 05:10:23 -05:00
d8c29f5fcf Updated FFT test for PETSc 2022-12-18 12:05:00 -05:00
281f8101fe Matt FFT test 2022-12-17 20:35:33 -05:00
07acfe89f2 Merge pull request #417 from rrhodgson/feature/fermtoprop
Feature/fermtoprop
2022-12-06 12:45:03 -05:00
40234f531f FermToProp accelerator_for -> thread_for 2022-12-06 17:34:51 +00:00
d49694f38f PropToFerm fix 2022-12-06 15:48:54 +00:00
97a098636d FermToProp 2022-11-30 15:36:35 -05:00
e13930c8b2 Faster fermtoprop case 2022-11-30 15:11:29 -05:00
0655dab466 Open MP on host enabled 2022-11-08 13:38:54 -08:00
7f097bcc28 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-11-08 13:23:40 -08:00
5c75aa5008 Device mem 2022-11-08 13:22:57 -08:00
1873101362 PVC 2022-11-08 13:22:45 -08:00
63fd1dfa62 Config on PVC 2022-11-08 13:22:09 -08:00
bd68861b28 SYCL sum 2022-11-08 12:49:26 -08:00
82e959f66c SYCL reduction 2022-11-08 12:45:25 -08:00
62e52de06d Merge pull request #414 from fjosw/feat/eCloverGPU
Compact Exponential Cloverterm on GPU
2022-11-01 09:15:44 -04:00
184adeedb8 feat: renamed open_boundaries to fixedBoundaries 2022-10-26 12:53:46 +01:00
5fa6a8b96d docs: CompactClover debug info generalized. 2022-10-26 12:41:14 +01:00
a2a879b668 docs: CompactClover Debug Info improved. 2022-10-25 17:20:42 +01:00
9317d893b2 docs: details about inversion of CompactClover term added. 2022-10-25 17:10:06 +01:00
86075fdd45 feat: MassTerm and ExponentiateClover merged into InstantiateClover 2022-10-25 17:05:34 +01:00
b36442e263 feat: CloverHelpers::InvertClover implemented which handles the
inversion of the Clover term depending on clover type and the boundary
conditions.
2022-10-25 16:57:01 +01:00
513d797ea6 fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed. 2022-10-25 16:17:22 +01:00
9e4835a3e3 feat: changed CompactWilsonExpClover exponentiation to Taylor expansion
with Horner scheme.
2022-10-25 15:19:43 +01:00
477ebf24f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-10-04 11:19:43 -07:00
0d5639f707 Run script update 2022-10-04 11:13:41 -07:00
413312f9a9 Benchmark the halo construction.
THe bye counts are out and should be doubled for SIMD directions
2022-10-04 11:12:59 -07:00
03508448f8 Remove verbose 2022-10-04 11:12:15 -07:00
e1e5c75023 Stencil gather improvements - SVM was running slow and used for a pointer array that wasn't needed to be in SVM 2022-10-04 11:11:10 -07:00
9296299b61 Better commenting 2022-10-04 11:10:34 -07:00
913fbca74a Merge pull request #410 from gkanwar/photon_and_sha_patches
Photon.h and SHA256 patches
2022-08-31 18:01:45 -04:00
60dfb49afa Remove FP16 tests when FP16 is disabled 2022-08-21 17:29:55 +02:00
554c238359 Update OpenSSL digest to use high-level methods
This avoids deprecation warnings when compiling against OpenSSL 3.0
but should still be backwards compatible. It is the recommended way
to use the digest API going forward.
2022-08-21 17:28:57 +02:00
f922adf05e Fix Photon ComplexField type 2022-08-21 16:16:18 +02:00
188d2c7a4d PVC default, ignore ATS 2022-08-02 08:38:53 -07:00
17d7177105 Files for SYCL 2022-08-02 08:33:39 -07:00
bb0a0da47a inon blocking caution due to SYCL 2022-08-02 08:09:43 -07:00
84110166e4 Fix the fence 2022-08-02 08:00:43 -07:00
d32b923b6c Fencing on a stream in SYCL is needed. Didn't know that ... gulp 2022-08-02 07:58:04 -07:00
2ab1af5754 Ensure no synchronize and not optoin dependent 2022-07-19 09:51:06 -07:00
5f8892bf03 Mistake pointed out by Camilo 2022-07-19 09:31:51 -07:00
f14e7e51e7 Grid accelerator 2022-07-12 10:56:22 -07:00
042ab1a052 Update GridStd.h 2022-06-27 13:21:39 -04:00
2df98a99bc Merge pull request #406 from giordano/patch-1
Update default value of gen-simd-width in README
2022-06-14 17:46:25 -04:00
315ea18be2 Update default value of gen-simd-width in README 2022-06-14 22:41:05 +01:00
a9c2e1df03 Merge pull request #404 from rrhodgson/feature/json_nvcc
Feature/json nvcc
2022-05-25 13:30:11 -04:00
da4daea57a Updated json to latest release 3.10.5 2022-05-24 16:16:06 +01:00
af3b065add Merge pull request #403 from fjosw/fix/cuda_11_5_warnings
Fixed nvcc 11.5+ warnings
2022-05-24 11:10:02 -04:00
e346154c5d Updated json CUDA compile guards 2022-05-24 15:48:01 +01:00
7937ac2bab fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in pugixml/pugixml.cc 2022-05-24 15:31:03 +01:00
e909aeedf0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in Grid_Eigen_Dense.h 2022-05-24 15:29:42 +01:00
bab8aa8eb0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT
standard in DisableWarnings.h
2022-05-24 15:27:40 +01:00
38b22f05be Merge pull request #402 from fjosw/fix/clover_warnings
fixed clover warnings
2022-05-24 10:05:27 -04:00
3ca0de1c40 Fix json write for vector<string> 2022-05-24 14:37:33 +01:00
c7205d2a73 Removed nvcc guards for json 2022-05-24 14:30:26 +01:00
617c5362c1 fix: fixed warning: missing return statement at end of non-void function
in CloverHelpers
2022-05-24 11:37:33 +01:00
083b58e66d Merge pull request #401 from JPRichings/LocalCoheranceDeflation
Local coherance batch deflation
2022-05-20 11:44:22 -04:00
633427a2df Merge pull request #400 from JPRichings/wilson_sweep
bench wilson sweep fix
2022-05-20 11:43:40 -04:00
2031d6910a Merge branch 'paboyle:develop' into wilson_sweep 2022-05-20 16:20:23 +01:00
79e34b3eb4 Local Coherence batch deflation 2022-05-19 14:53:17 +01:00
4f3d581ab4 Merge branch 'paboyle:develop' into LocalCoheranceDeflation 2022-05-19 14:46:17 +01:00
d16427b837 Merge pull request #399 from fjosw/fix/Nc_neq_3
fix: assert for dimensions of compact Wilson clover moved to constructor
2022-05-17 09:03:42 -04:00
4b1997e2f3 wilson sweep test 2022-05-16 15:58:33 +01:00
8939d5dc73 bugfix: eo operator called in correct location 2022-05-16 00:28:28 +01:00
b051e00de0 Additional Local Coherance Deflation operator() 2022-05-16 00:25:13 +01:00
8aa75b492f Merge branch 'develop' into fix/Nc_neq_3 2022-05-10 14:22:03 +01:00
0274f40686 Merge pull request #389 from mbruno46/mbruno-eclover
Feature/expClover
2022-05-10 09:18:19 -04:00
77aa147ce5 Merge branch 'develop' into mbruno-eclover 2022-05-10 09:16:53 -04:00
32facbd02a fix: assert for dimensions of compact Wilson clover moved to
constructor.
2022-05-10 10:53:22 +01:00
4de50ab146 Merge pull request #396 from fjosw/fix/readd_config.h
fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am
2022-05-09 08:26:48 -04:00
8b12a61097 fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am 2022-05-09 11:53:22 +01:00
79ea027c0b Merge pull request #377 from RJHudspith/develop
NERSC and ILDG for non-SU(3) configuration checkpoints
2022-05-03 08:55:48 -04:00
62339d437f Merge pull request #387 from lehner/feature/gpt
Parity mass terms for domain wall fermions to enable 4d eofa
2022-05-03 08:52:18 -04:00
698e745276 Merge pull request #390 from fjosw/feature/conserved_current_wilson
Conserved current for wilson fermions
2022-05-03 08:51:10 -04:00
9a6e2c315d Merge pull request #394 from fjosw/fix/gauge_fix_ErrorOnNoConverge
SteepestDescentGaugeFix now exits when the algorithm does not converge.
2022-05-03 08:49:26 -04:00
e61fed87db SteepestDescentGaugeFix now exits when the algorithm does not converge.
This behaviour can be altered by setting err_on_no_converge to false.
2022-04-20 15:41:55 +01:00
b8bc560b51 Test_wilson_conserved_current implemented, all 5d references removed. 2022-04-05 17:33:45 +01:00
6bc2483d57 Merge branch 'feature/eclover' into feature/conserved_current_wilson 2022-04-05 15:26:49 +01:00
82aecbf4cf Test_wilson_conserved_current added 2022-04-05 15:26:39 +01:00
ee23a76aa0 Merge pull request #2 from fjosw/feature/eclover
Feature/eclover
2022-04-05 13:30:13 +02:00
d7191e5a02 SeqConservedCurrent implemented for Wilson fermions 2022-04-05 11:48:56 +01:00
c8a824425b Error message added if another conserved current than vector is requested for
Wilson type fermions.
2022-04-05 10:58:22 +01:00
f23626a6b8 End scope by additional block in CloverHelpers.h 2022-04-02 16:08:15 +01:00
6577a03d16 Explcitly closed views in Exponentiate_Clover 2022-04-01 18:39:12 +01:00
427c8695fe Change signs and prefactors for conserved current to mimic the 5d
version.
2022-04-01 16:20:21 +01:00
9e82c468ab Multiplication of diagonal mass in exponentiate fixed for gpus 2022-04-01 15:54:43 +01:00
603fd96747 Missing link multiplication added. 2022-04-01 10:58:56 +01:00
fe993c0836 /=2 replaced by *=0.5 2022-03-31 17:08:17 +01:00
cdf31d52c1 GaugeGrid and typo fixed 2022-03-31 17:04:35 +01:00
0542eaf1da First version of conserved current contraction for Wilson type quarks 2022-03-31 17:02:09 +01:00
317bdcf158 nerscio parametrization 2022-03-24 13:10:47 +01:00
9ca2c98882 Merge branch 'develop' of https://github.com/paboyle/Grid into mbruno-eclover 2022-03-22 15:31:37 +01:00
605cf401e1 Merge branch 'feature/sumd-npr' into develop 2022-03-16 22:43:12 +00:00
f99c3660d2 Merge branch 'feature/cpu-threaded-smp' into develop 2022-03-16 22:07:54 +00:00
92a83a9eb3 Performance improve for Tesseract 2022-03-16 17:14:36 +00:00
53ae01a34a Merge pull request #1 from fjosw/feature/eclover
Feature/eclover
2022-03-15 15:23:35 +01:00
b615fa0f35 Merge pull request #388 from fjosw/feature/sumd-npr
Feature/sumd npr
2022-03-15 09:05:57 -04:00
76c294a7ba open bc fix 2022-03-08 13:55:16 +01:00
0c0c2b1e20 Unnecessary arguments of CloverHelpers::Exponentiate_Clover removed. 2022-03-08 09:44:51 +00:00
e2fc3a0f04 Merge pull request #28 from paboyle/develop
Sync with Upstream
2022-03-08 09:58:51 +01:00
451e7972fd Reintroduced explicit inversion of the Clover term in case of the
CompactExpClover because of the open boundary O(a) improvement. Changed
the timing output to GridLogDebug
2022-03-07 17:43:33 +00:00
56c089d347 Removed leftover comments 2022-03-07 16:40:20 +00:00
acf740e44d Merge pull request #1 from FelixPGZiegler/feature/eclover
Feature/eclover
2022-03-07 16:25:11 +00:00
182f513404 Merge remote-tracking branch 'fjosw/feature/eclover' into feature/eclover 2022-03-07 15:22:04 +00:00
d5b2323a57 included Cayley-Hamilton exponentiation for the compact Wilson exp clover, bug fix for inverse of exp clover 2022-03-07 14:44:24 +00:00
bad18d4417 Merge branch 'paboyle:develop' into feature/eclover 2022-03-07 13:54:10 +00:00
d1decee4cc Cleaned up unused variables in Lattice_reduction_gpu.h 2022-03-02 16:54:23 +00:00
d4ae71b880 sum_gpu_large and sum_gpu templates added. 2022-03-02 15:40:18 +00:00
e16fc5b2e4 Threaded intranode comms transfer - ideally between NUMA domains 2022-03-01 11:17:24 -05:00
694306f202 Configure for mac arm 2022-03-01 10:53:44 -05:00
9aac1e6d64 Merge branch 'develop' into feature/sumd-npr 2022-03-01 10:51:38 -05:00
3e882f555d Large / small sumD options 2022-03-01 08:54:45 -05:00
438caab25f generate_instantiations.sh now correctly produces instantiations for CompactClover variant, redundant instantiations removed. 2022-02-27 18:27:18 +00:00
239e2c1ee6 tests: wilson clover cg tests now include compact variant as well as
exponential wilson clover operators
2022-02-27 18:26:34 +00:00
013dc2ef33 tests: core tests for wilson clover and wilson exp clover including
compact version extended/added
2022-02-27 18:13:47 +00:00
9616811c3d Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2022-02-24 22:03:05 +01:00
8a3002c03b separate left and right masses for CayleyFermion5D 2022-02-24 22:02:56 +01:00
71034f828e attempt to fix broken WilsonExpClover; Compact version still broken will be replaced by F.Joswig 2022-02-23 01:02:27 +01:00
11437930c5 cleaned up definitions of wilsonclover fermions 2022-02-22 10:45:16 +01:00
3d44aa9cb9 cleaned up cloverhelpers; fixed test compact_clover which runs 2022-02-22 01:10:19 +01:00
2851870d70 expClover support via helpers template class 2022-02-22 00:05:43 +01:00
63dbaeefaa Extra barrier prior to finalize just in case it fixes an issue on Tursa 2022-02-16 14:01:43 +00:00
e8c187b323 SyCL happier? 2022-02-15 11:24:38 -05:00
0c1618197f Faster intranode MPI works now 2022-02-15 08:52:07 -05:00
f49d5c2d22 Updated scripts for crusher 2022-02-14 17:55:16 -05:00
a3b022d469 Crusher compile 2022-02-14 15:09:08 -05:00
48772f0976 Merge pull request #384 from jdmaia/hip_launchbounds
Changing thread block order and adding launch_bounds
2022-02-14 11:08:28 -05:00
c322420580 Dont instantiate an Nc=3 and non-GP hardwired code for other implementations 2022-02-14 16:04:08 +00:00
86f4e17928 Changing thread block order and adding launch_bounds 2022-02-07 11:29:37 -06:00
215df671be Merge pull request #382 from DanielRichtmann/feature/compact-clover
Compact Clover Fermions
2022-02-01 21:45:38 -05:00
1b6b12589f Get splitting up into implementation and instantiation files correct 2022-02-02 00:51:11 +01:00
3082ab8252 Check in compact version of wilson clover fermions 2022-02-02 00:50:05 +01:00
add86cd7f4 Abandon ET for clover application, use construct similar to multLink 2022-02-01 23:09:06 +01:00
0b6fd20c54 Enable memory coalescing in clover term generation 2022-02-01 23:09:06 +01:00
e83423fee6 Refactor clover to align with other files and prepare for upcoming changes 2022-02-01 23:09:06 +01:00
b4f8e87982 Have Grid's cli interface understand floats 2022-02-01 23:09:06 +01:00
135808dcfa Less verbose 2021-12-07 16:24:24 -05:00
7f7d06d963 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-12-07 09:06:42 -08:00
2bf3b4d576 Update to reduce memory footpring in benchmark test 2021-12-07 09:02:02 -08:00
0bd83cdbda Fixes for Nc!=3 Nersc IO, Gauge and Gauge_NCxNC compatible with GLU. Trace normalisation changed in places removing explicit threes. Guards against non-su3 tests and tests failing when LIME is not compiled. 2021-11-28 21:51:03 +01:00
f34d34bd17 2 nodes 2021-11-22 22:27:16 -05:00
e32d5141b4 Updated to make MPI reliable still gives good perf, but MPI will be slow
intranode
2021-11-22 21:46:31 -05:00
6d5277f2d7 Update to Spock 2021-11-22 20:58:02 -05:00
14d82777e0 Best modules for spock 2021-11-22 20:47:16 -05:00
2a4e739513 Enable XGMI copy (need to rename nvlink to cover NVLINK/XGMI/XeLink) 2021-11-22 20:46:09 -05:00
8079dc2a14 Cray MPI not working right yet 2021-11-22 20:45:44 -05:00
6ceb556684 Intranode asynch hipMemCopy 2021-11-22 20:45:12 -05:00
76cde73705 HIP improvements on messaging and intranode hipMemCopyAsynch 2021-11-22 20:44:39 -05:00
cc094366a9 Merge pull request #375 from JPRichings/develop
Lattice object ACCcache probe
2021-11-09 18:19:32 -05:00
41a575ff9b Format edit 2021-11-09 21:56:23 +00:00
12ef413065 fix to deflation.h 2021-11-09 21:20:36 +00:00
829a328451 remove deflation timing 2021-11-09 20:46:57 +00:00
402523c62e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-11-09 12:57:40 +00:00
d7bef70b5c Helper functions to allow probe of cache state of lattice objects. 2021-11-09 12:57:09 +00:00
2ad1811642 Added timing to deflation code. 2021-11-09 12:33:25 +00:00
a65a497bae Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-10-29 13:01:34 +01:00
b27b12828e reverse previous "fix", missing statement was probably intentional, added a comment to that effect 2021-10-29 13:01:31 +01:00
42d56ea6b6 Verbosity 2021-10-29 02:23:08 +01:00
0b905a72dd Better reduction for GPUs 2021-10-29 02:22:22 +01:00
fe9edf8526 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-29 02:03:27 +01:00
44204c7e06 Extra code 2021-10-29 02:02:56 +01:00
33b3789598 Merge pull request #364 from AndrewYongZhenNing/develop
CayleyFermion5D Conserved current fix
2021-10-27 20:27:20 -04:00
195ab2888d Merge branch 'develop' into develop 2021-10-27 20:26:57 -04:00
85f750d753 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-27 00:28:05 +01:00
a4ce6e42c7 Warning free compile on make all and make tests under nvcc 2021-10-27 00:27:03 +01:00
5398b7e7e3 Max 128 size 2021-10-26 09:16:29 -07:00
fd13a3f2be Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-26 10:45:46 +01:00
c144b32368 deflation timers 2021-10-26 10:37:24 +01:00
ba7e371b90 Warning free compile on Tursa.
Hopefully got all reqd virtual dtors
2021-10-21 19:56:52 +01:00
99e7a5d18a Merge pull request #371 from edbennett/hmc-documentation-update
update documentation for GenericHMCRunner - thanks
2021-10-18 14:36:43 -04:00
f824d99059 update documentation for GenericHMCRunner 2021-10-18 09:50:16 +01:00
749b8022a4 Linear operator and SparseMatrix virtual destructors 2021-10-15 20:47:18 +01:00
7e0057d2c4 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-15 20:46:51 +01:00
cfe9e870d3 Stream 2021-10-15 20:46:44 +01:00
e9c4f06cbf Merge pull request #370 from fjosw/bugfix/gpu_sum_shm
Error Handling sum_Dgpu large objects
2021-10-14 09:12:47 -04:00
1f9688417a Error message added when attempting to sum object which is too large for
the shared memory
2021-10-13 20:45:46 +01:00
16c2a99965 Overlap cudamemcpy - didn't set up stream right 2021-10-11 13:31:26 -07:00
cda915a345 Better options 2021-10-07 20:29:09 +01:00
7c16189e16 Merge pull request #368 from Heinrich-BR/develop
Accelerated Pick-Set Checkerboard functions
2021-10-07 15:13:09 -04:00
ecbfccea43 Merge pull request #369 from paboyle/gauge-group-covariance
expose gauge group in GImpl and generic Nc fix
2021-10-07 15:11:12 -04:00
a8eda8f6da Summit scripts 2021-10-05 21:22:10 -04:00
9b1a0653cf Summit results 2021-10-05 21:22:01 -04:00
7cb1ff7395 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-05 20:13:42 -04:00
ab6ea29913 Print removal 2021-10-05 20:13:25 -04:00
b5c81a02b6 Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-10-05 21:13:01 +01:00
d899ee80fc skip record fixed to include norm metadata 2021-10-05 21:12:47 +01:00
4016e705fc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-05 14:56:57 -04:00
2f4e85e5d6 Summit set up 2021-10-05 14:56:17 -04:00
8ed0b57b09 Memory verbose and tracking, shrink default cache
Print PCI device IDs on node 0
2021-10-05 11:41:03 -04:00
7e130076d6 Fixed line left behind 2021-09-24 17:26:31 +01:00
6efdad6f21 Removed Halo benchmark 2021-09-24 17:18:04 +01:00
a822c48565 Added accelerated pick-set checkerboard functions 2021-09-24 17:13:25 +01:00
014fb76e88 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-09-24 16:45:25 +01:00
30e5311b43 Update from the gods upstream 2021-09-24 16:39:56 +01:00
11ee8a1061 Merge remote-tracking branch 'upstream/develop' into develop 2021-09-02 16:57:42 +01:00
770680669d Whitespace removal. 2021-08-04 09:21:59 +01:00
0cdfc5cf22 Merge remote-tracking branch 'upstream/develop' into develop 2021-07-30 14:40:55 +01:00
428b8ba907 Updated from upstream and added halo benchmark 2021-06-29 01:05:12 +01:00
54c6b1376d Quick fix of conserved current implementation in CayleyFermion5D. Now function treats current insertion with appropriate periodic boundary conditions in the mu=3 direction. 2021-04-21 16:56:46 +01:00
f3f11b586f Tadpole sign now in front of forward hopping term to be consistent with previous implementation and analytic form. 2021-04-17 12:44:27 +01:00
8083e3f7e8 Sign factor for tadpole implementation corrected. 2021-04-15 11:14:31 +01:00
364793154b Reverted checkerboard changes 2021-04-09 15:47:17 +01:00
3e2ae1e9af Added profiling messages to pick and set checkerboard functions 2021-04-08 16:58:47 +01:00
d38ae2fd18 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-04-06 17:18:39 +01:00
030e7754e4 Merge remote-tracking branch 'upstream/develop' into develop 2021-04-06 17:16:13 +01:00
3b7fce1e76 Reverted checkerboard changes 2021-04-02 14:38:41 +01:00
4d15417f93 Merge remote-tracking branch 'upstream/develop' into develop 2021-04-01 18:28:15 +01:00
ab3c855f65 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-04-01 18:22:05 +01:00
92e2c517d8 Changed pick- and setCheckerboard to use accelerator_for 2021-04-01 18:21:19 +01:00
172 changed files with 21310 additions and 12200 deletions

View File

@ -44,14 +44,22 @@ directory
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC

View File

@ -16,6 +16,7 @@
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>

View File

@ -14,7 +14,11 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
#endif
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")

View File

@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>

View File

@ -262,7 +262,7 @@ public:
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
@ -358,7 +358,7 @@ public:
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
auto& geom_v = geom;
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
@ -380,7 +380,7 @@ public:
int ptype;
StencilEntry *SE;
for(int point=0;point<geom_v.npoint;point++){
for(int point=0;point<npoint;point++){
SE=Stencil_v.GetEntry(ptype,point,ss);
@ -424,7 +424,7 @@ public:
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
auto& geom_v = geom;
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
@ -454,7 +454,7 @@ public:
int ptype;
StencilEntry *SE;
for(int p=0;p<geom_v.npoint;p++){
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=Stencil_v.GetEntry(ptype,point,ss);

View File

@ -52,6 +52,7 @@ public:
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0;
virtual ~LinearOperatorBase(){};
};
@ -507,7 +508,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
virtual void MpcDag (const Field &in, Field &out){
Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
virtual void MpcDagMpc(const Field &in, Field &out) {
assert(0);// Never need with staggered
}
};
@ -585,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
@ -598,6 +600,7 @@ public:
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;

View File

@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class Field> class Preconditioner : public LinearFunction<Field> {
template<class Field> using Preconditioner = LinearFunction<Field> ;
/*
template<class Field> class Preconditioner : public LinearFunction<Field> {
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field & psi)=0;
};
*/
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
void operator()(const Field &src, Field & psi){
using Preconditioner<Field>::operator();
virtual void operator()(const Field &src, Field & psi){
psi = src;
}
TrivialPrecon(void){};

View File

@ -48,6 +48,7 @@ public:
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
virtual ~SparseMatrixBase() {};
};
/////////////////////////////////////////////////////////////////////////////////////////////
@ -72,7 +73,7 @@ public:
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual ~CheckerBoardedSparseMatrixBase() {};
};
NAMESPACE_END(Grid);

View File

@ -264,7 +264,7 @@ public:
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});

View File

@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
{
public:
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;

View File

@ -35,7 +35,8 @@ NAMESPACE_BEGIN(Grid);
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;

View File

@ -0,0 +1,213 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -33,16 +33,19 @@ namespace Grid {
template<class Field>
class ZeroGuesser: public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
};
template<class Field>
class DoNothingGuesser: public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { };
};
template<class Field>
class SourceGuesser: public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = src; };
};
@ -57,6 +60,7 @@ private:
const unsigned int N;
public:
using LinearFunction<Field>::operator();
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
: DeflatedGuesser(_evec, _eval, _evec.size())
@ -87,6 +91,7 @@ private:
const std::vector<RealD> &eval_coarse;
public:
using LinearFunction<FineField>::operator();
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
@ -108,7 +113,43 @@ public:
blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard();
};
};
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
int Nevec = (int)evec_coarse.size();
int Nsrc = (int)src.size();
// make temp variables
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
//Preporcessing
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
guess_coarse[j] = Zero();
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockProject(src_coarse[j],src[j],subspace);
}
//deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
for (int i=0;i<Nevec;i++)
{
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
const CoarseField & tmp = evec_coarse[i];
for (int j=0;j<Nsrc;j++)
{
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
}
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
};

View File

@ -67,6 +67,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -97,6 +98,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field

View File

@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;

View File

@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
@ -119,7 +119,8 @@ public:
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b, zAz;
ComplexD a, b;
// ComplexD zAz;
RealD zAAz;
ComplexD rq;
@ -146,7 +147,7 @@ public:
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
@ -170,7 +171,7 @@ public:
LinalgTimer.Start();
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
@ -212,7 +213,7 @@ public:
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();

View File

@ -4,106 +4,156 @@ NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#define CpuHuge (1)
#define CpuSmall (2)
#define Acc (3)
#define AccHuge (4)
#define AccSmall (5)
#define Shared (6)
#define SharedHuge (7)
#define SharedSmall (8)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : PrintBytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
uint64_t cacheBytes;
cacheBytes = CacheBytes[Cpu];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Acc];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Shared];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
#ifdef GRID_CUDA
cuda_mem();
#endif
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
total_device+=bytes;
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
total_device-=bytes;
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
total_device-=bytes;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorFree "<<std::endl;
PrintBytes();
#endif
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
total_shared+=bytes;
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes;
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
total_shared-=bytes;
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_shared-=bytes;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedFree "<<std::endl;
PrintBytes();
#endif
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_host-=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
total_host+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
total_host-=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#endif
@ -115,7 +165,6 @@ void MemoryManager::Init(void)
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
@ -127,6 +176,16 @@ void MemoryManager::Init(void)
}
}
str= getenv("GRID_ALLOC_NCACHE_HUGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuHuge]=Nc;
Ncache[AccHuge]=Nc;
Ncache[SharedHuge]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
@ -147,7 +206,9 @@ void MemoryManager::InitMessage(void) {
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
#endif
#ifdef GRID_UVM
@ -179,21 +240,25 @@ void MemoryManager::InitMessage(void) {
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
if (ncache == 0) return ptr;
void * ret = NULL;
int v = -1;
@ -211,6 +276,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
if ( entries[v].valid ) {
ret = entries[v].address;
cacheBytes -= entries[v].bytes;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
@ -219,6 +285,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
cacheBytes += bytes;
return ret;
}
@ -226,23 +293,26 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
cacheBytes -= entries[e].bytes;
return entries[e].address;
}
}

View File

@ -35,6 +35,12 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT (2147483648)
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__)
#define AUDIT(a) MemoryManager::Audit(FILE_LINE)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
@ -65,6 +71,21 @@ enum ViewMode {
CpuWriteDiscard = 0x10 // same for now
};
struct MemoryStatus {
uint64_t DeviceBytes;
uint64_t DeviceLRUBytes;
uint64_t DeviceMaxBytes;
uint64_t HostToDeviceBytes;
uint64_t DeviceToHostBytes;
uint64_t HostToDeviceXfer;
uint64_t DeviceToHostXfer;
uint64_t DeviceEvictions;
uint64_t DeviceDestroy;
uint64_t DeviceAllocCacheBytes;
uint64_t HostAllocCacheBytes;
};
class MemoryManager {
private:
@ -78,21 +99,23 @@ private:
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static const int NallocType=9;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
static uint64_t CacheBytes[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
static void PrintBytes(void);
public:
static void PrintBytes(void);
static void Audit(std::string s);
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);
@ -112,7 +135,28 @@ private:
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
static uint64_t DeviceEvictions;
static uint64_t DeviceDestroy;
static uint64_t DeviceCacheBytes();
static uint64_t HostCacheBytes();
static MemoryStatus GetFootprint(void) {
MemoryStatus stat;
stat.DeviceBytes = DeviceBytes;
stat.DeviceLRUBytes = DeviceLRUBytes;
stat.DeviceMaxBytes = DeviceMaxBytes;
stat.HostToDeviceBytes = HostToDeviceBytes;
stat.DeviceToHostBytes = DeviceToHostBytes;
stat.HostToDeviceXfer = HostToDeviceXfer;
stat.DeviceToHostXfer = DeviceToHostXfer;
stat.DeviceEvictions = DeviceEvictions;
stat.DeviceDestroy = DeviceDestroy;
stat.DeviceAllocCacheBytes = DeviceCacheBytes();
stat.HostAllocCacheBytes = HostCacheBytes();
return stat;
};
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
@ -169,6 +213,8 @@ private:
public:
static void Print(void);
static void PrintAll(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);

View File

@ -3,8 +3,13 @@
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
//#define dprintf(...)
////////////////////////////////////////////////////////////
@ -23,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
////////////////////////////////////
// Priority ordering for unlocked entries
@ -104,15 +111,17 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++;
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@ -121,26 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
// Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
// Cannot be acclocked. If allocated must be in LRU pool.
//
// Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
// and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
// but there is a weakness where CpuLock entries are attempted for erase
// Take these OUT LRU queue when CPU locked?
// Cannot take out the table as cpuLock data is important.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
assert(AccCache.accLock==0); // Cannot evict so logic bomb
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
// EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
@ -150,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@ -165,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
@ -191,6 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
@ -202,6 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -212,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
} else {
return;
}
}
}
@ -241,11 +265,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes);
(uint64_t)bytes,
(uint64_t)AccCache.accLock);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
@ -280,6 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
@ -292,28 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
} else {
assert(0);
}
// If view is opened on device remove from LRU
assert(AccCache.accLock>0);
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n");
LRUremove(AccCache);
}
@ -334,10 +362,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache);
} else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@ -374,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
// CPU doesn't need to free space
// if (!AccCache.AccPtr) {
// EvictVictims(bytes);
// }
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
@ -429,20 +460,29 @@ void MemoryManager::NotifyDeletion(void *_ptr)
}
void MemoryManager::Print(void)
{
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
PrintBytes();
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Memory Manager " << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogMessage << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
}
void MemoryManager::PrintAll(void)
{
Print();
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
@ -452,13 +492,13 @@ void MemoryManager::Print(void)
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
@ -472,6 +512,87 @@ int MemoryManager::isOpen (void* _CpuPtr)
return 0;
}
}
void MemoryManager::Audit(std::string s)
{
uint64_t CpuBytes=0;
uint64_t AccBytes=0;
uint64_t LruBytes1=0;
uint64_t LruBytes2=0;
uint64_t LruCnt=0;
uint64_t LockedBytes=0;
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it;
assert(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes;
assert(AccCache.LRU_valid==1);
assert(AccCache.LRU_entry==it);
}
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
CpuBytes+=AccCache.bytes;
if( AccCache.AccPtr ) AccBytes+=AccCache.bytes;
if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) {
assert(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t cpuLock " << AccCache.cpuLock
<< "\t accLock " << AccCache.accLock
<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
}
assert( AccCache.cpuLock== 0 ) ;
assert( AccCache.accLock== 0 ) ;
}
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
assert(LruBytes1==LruBytes2);
assert(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
assert(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
assert(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
}
void MemoryManager::PrintState(void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
} else {
std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
}
}
NAMESPACE_END(Grid);

View File

@ -12,11 +12,19 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
void MemoryManager::Audit(std::string s){};
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::PrintState(void* CpuPtr)
{
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::PrintAll(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);

View File

@ -388,18 +388,21 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
// TODO : make a OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
acceleratorCopySynchronise(); // MPI prob slower
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
// if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
// this->StencilSendToRecvFromComplete(list,dir);
// }
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
int nreq=list.size();
if (nreq==0) return;

File diff suppressed because it is too large Load Diff

View File

@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>

View File

@ -88,6 +88,13 @@ public:
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose();
}
// Helper function to print the state of this object in the AccCache
void PrintCacheState(void)
{
MemoryManager::PrintState(this->_odata);
}
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
@ -281,8 +288,8 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@ -296,8 +303,8 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});

View File

@ -0,0 +1,55 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_crc.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
{
auto ff = localNorm2(f);
if ( mu==-1 ) mu = f.Grid()->Nd()-1;
typedef typename vobj::tensor_reduced normtype;
typedef typename normtype::scalar_object scalar;
std::vector<scalar> sff;
sliceSum(ff,sff,mu);
for(int t=0;t<sff.size();t++){
std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
}
}
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);

View File

@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
NAMESPACE_BEGIN(Grid);
@ -127,7 +130,7 @@ inline Double max(const Double *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
@ -136,25 +139,50 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
typename vobj::scalar_object ssum;
autoView( arg_v, arg, AcceleratorRead);
ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum);
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu(&arg_v[0],osites);
auto ssum= sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
#endif
arg.Grid()->GlobalSum(ssum);
return ssum;
}
@ -210,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];

View File

@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
}
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
#ifdef GRID_CUDA
@ -37,14 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
/*
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
*/
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
@ -52,10 +51,14 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
// let the number of threads in a block be a multiple of 2, starting from warpSize
threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
return 0;
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
return 1;
}
template <class sobj, class Iterator>
@ -195,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
@ -204,7 +207,9 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
@ -215,6 +220,54 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
auto result = buffer_v[0];
return result;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
scalarD *ret_p = (scalarD *)&ret;
const int words = sizeof(vobj)/sizeof(vector);
Vector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
ret_p[w] = sumD_gpu_small(tbuf,osites);
}
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
if ( ok ) {
ret = sumD_gpu_small(lat,osites);
} else {
ret = sumD_gpu_large(lat,osites);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -227,6 +280,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,125 @@
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
sobj identity; zeroit(identity);
sobj ret ;
Integer nsimd= vobj::Nsimd();
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction,
[=] (cl::sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
theGridAccelerator->wait();
ret = mysum[0];
free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret);
return dret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
return sumD_gpu_tensor(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);
/*
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
{
Double sumResult; zeroit(sumResult);
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
Double identity; zeroit(identity);
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum +=vec[index];
});
});
theGridAccelerator->wait();
Double ret = d_sum[0];
free(d_sum,*theGridAccelerator);
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/

View File

@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
});
}
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
{
half.Checkerboard() = cb;
autoView(half_v, half, AcceleratorWrite);
autoView(full_v, full, AcceleratorRead);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++) {
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(half_v[ssh],full_v(ss));
}
});
}
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
{
int cb = half.Checkerboard();
autoView(half_v , half, AcceleratorRead);
autoView(full_v , full, AcceleratorWrite);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++){
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(full_v[ss],half_v(ssh));
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Flexible Type Conversion for internal promotion to double as well as graceful
// treatment of scalar-compatible types
@ -218,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
}
}
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -520,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
#endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj>

View File

@ -65,29 +65,40 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0);
GridLogError.Active(1);
GridLogWarning.Active(0);
GridLogMessage.Active(1); // at least the messages should be always on
GridLogMemory.Active(0);
GridLogTracing.Active(0);
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
}
}

View File

@ -138,7 +138,8 @@ public:
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
stream << log.colour() << std::left;
// stream << log.colour() << std::left;
stream << std::left;
if (log.chanWidth > 0)
{
stream << std::setw(log.chanWidth);
@ -153,9 +154,9 @@ public:
stream << log.evidence()
<< now << log.background() << " : " ;
}
stream << log.colour();
// stream << log.colour();
stream << std::right;
stream.flags(f);
return stream;
} else {
return devnull;
@ -180,8 +181,12 @@ extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern GridLogger GridLogTracing;
extern Colours GridLogColours;
std::string demangle(const char* name) ;

View File

@ -31,6 +31,7 @@ directory
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <map>
#include <pwd.h>
@ -576,6 +577,8 @@ class ScidacReader : public GridLimeReader {
std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
// in principle should do the line below, but that breaks backard compatibility with old data
// skipPastObjectRecord(std::string(GRID_FIELD_NORM));
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return;
}
@ -652,7 +655,8 @@ class IldgWriter : public ScidacWriter {
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
ildgfmt.field = std::string("su3gauge");
const std::string stNC = std::to_string( Nc ) ;
ildgfmt.field = std::string("su"+stNC+"gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
@ -869,7 +873,8 @@ class IldgReader : public GridLimeReader {
} else {
assert(found_ildgFormat);
assert ( ildgFormat_.field == std::string("su3gauge") );
const std::string stNC = std::to_string( Nc ) ;
assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@ -877,7 +882,7 @@ class IldgReader : public GridLimeReader {
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);

View File

@ -6,8 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -182,8 +182,8 @@ class GaugeStatistics
public:
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
header.link_trace=WilsonLoops<Impl>::linkTrace(data);
header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
header.link_trace = WilsonLoops<Impl>::linkTrace(data);
header.plaquette = WilsonLoops<Impl>::avgPlaquette(data);
}
};
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@ -203,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
assert( Nc < 4 && Nc > 1 ) ;
for(int mu=0;mu<Nd;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
cm(mu)()(1,1) = adj(cm(mu)()(0,x)) ;
#else
const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#endif
}
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@ -278,7 +282,6 @@ struct GaugeSimpleMunger{
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
@ -317,8 +320,8 @@ template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
@ -330,8 +333,8 @@ template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}

View File

@ -9,6 +9,7 @@
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -30,6 +31,8 @@
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
#include <string>
NAMESPACE_BEGIN(Grid);
using namespace Grid;
@ -145,15 +148,17 @@ public:
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
const int ieee32big = (format == std::string("IEEE32BIG"));
const int ieee32 = (format == std::string("IEEE32"));
const int ieee64big = (format == std::string("IEEE64BIG"));
const int ieee64 = (format == std::string("IEEE64") || \
format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
const std::string stNC = std::to_string( Nc ) ;
if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@ -164,7 +169,7 @@ public:
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
} else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@ -209,27 +214,29 @@ public:
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"))
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
writeConfiguration(Umu,file,0,1,ens_label);
writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
int two_row,
int bits32,
std::string ens_label = std::string("DWF"))
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = std::string("UKQCD");
header.sequence_number = sequence_number;
header.ensemble_id = ens_id;
header.ensemble_label = ens_label;
header.hdr_version = "1.0" ;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
@ -243,10 +250,14 @@ public:
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
// Sod it -- always write NcxNc double
header.floating_point = std::string("IEEE64BIG");
const std::string stNC = std::to_string( Nc ) ;
if( two_row ) {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
} else {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
}
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
@ -254,8 +265,15 @@ public:
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
if( two_row ) {
Gauge3x2unmunger<fobj2D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
}
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
@ -287,8 +305,7 @@ public:
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
@ -328,7 +345,7 @@ public:
GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header);
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);

View File

@ -27,10 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid);
GridTimePoint theProgramStart = GridClock::now();
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {

View File

@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H
#ifndef __SSC_START
#define __SSC_START
#define __SSC_STOP
#endif
#include <sys/time.h>
#include <ctime>
#include <chrono>
@ -72,17 +78,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/

View File

@ -35,17 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid)
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
#ifdef TIMERS_ON
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
//typedef std::chrono::system_clock GridClock;
typedef std::chrono::high_resolution_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::seconds GridSecs;
@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridUsecs;
typedef std::chrono::microseconds GridTime;
extern GridTimePoint theProgramStart;
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart);
return 1.0*usecs.count();
}
inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
{
stream << time.count()<<" s";

70
Grid/perfmon/Tracing.h Normal file
View File

@ -0,0 +1,70 @@
#pragma once
NAMESPACE_BEGIN(Grid);
#ifdef GRID_TRACING_NVTX
#include <nvToolsExt.h>
class GridTracer {
public:
GridTracer(const char* name) {
nvtxRangePushA(name);
}
~GridTracer() {
nvtxRangePop();
}
};
inline void tracePush(const char *name) { nvtxRangePushA(name); }
inline void tracePop(const char *name) { nvtxRangePop(); }
inline int traceStart(const char *name) { }
inline void traceStop(int ID) { }
#endif
#ifdef GRID_TRACING_ROCTX
#include <roctracer/roctx.h>
class GridTracer {
public:
GridTracer(const char* name) {
roctxRangePushA(name);
std::cout << "roctxRangePush "<<name<<std::endl;
}
~GridTracer() {
roctxRangePop();
std::cout << "roctxRangePop "<<std::endl;
}
};
inline void tracePush(const char *name) { roctxRangePushA(name); }
inline void tracePop(const char *name) { roctxRangePop(); }
inline int traceStart(const char *name) { roctxRangeStart(name); }
inline void traceStop(int ID) { roctxRangeStop(ID); }
#endif
#ifdef GRID_TRACING_TIMER
class GridTracer {
public:
const char *name;
double elapsed;
GridTracer(const char* _name) {
name = _name;
elapsed=-usecond();
}
~GridTracer() {
elapsed+=usecond();
std::cout << GridLogTracing << name << " took " <<elapsed<< " us" <<std::endl;
}
};
inline void tracePush(const char *name) { }
inline void tracePop(const char *name) { }
inline int traceStart(const char *name) { return 0; }
inline void traceStop(int ID) { }
#endif
#ifdef GRID_TRACING_NONE
#define GRID_TRACE(name)
inline void tracePush(const char *name) { }
inline void tracePop(const char *name) { }
inline int traceStart(const char *name) { return 0; }
inline void traceStop(int ID) { }
#else
#define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name);
#endif
NAMESPACE_END(Grid);

View File

@ -16,8 +16,12 @@
#ifdef __NVCC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#endif
#endif
#include "pugixml.h"

View File

@ -451,9 +451,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
// Fermion <-> propagator assignements
//////////////////////////////////////////////
//template <class Prop, class Ferm>
#define FAST_FERM_TO_PROP
template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,CpuWrite);
autoView(f_v,f,CpuRead);
thread_for(idx,p_v.oSites(),{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
}}
});
#else
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
@ -465,12 +476,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
}
pokeSpin(p, pjs, j, s);
}
#endif
}
//template <class Prop, class Ferm>
template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,CpuRead);
autoView(f_v,f,CpuWrite);
thread_for(idx,p_v.oSites(),{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
}}
});
#else
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
@ -482,6 +504,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
}
pokeSpin(f, fj, j);
}
#endif
}
//////////////////////////////////////////////

View File

@ -68,9 +68,16 @@ public:
///////////////////////////////////////////////////////////////
// Support for MADWF tricks
///////////////////////////////////////////////////////////////
RealD Mass(void) { return mass; };
RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
RealD MassPlus(void) { return mass_plus; };
RealD MassMinus(void) { return mass_minus; };
void SetMass(RealD _mass) {
mass=_mass;
mass_plus=mass_minus=_mass;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void SetMass(RealD _mass_plus, RealD _mass_minus) {
mass_plus=_mass_plus;
mass_minus=_mass_minus;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void P(const FermionField &psi, FermionField &chi);
@ -108,7 +115,7 @@ public:
void MeooeDag5D (const FermionField &in, FermionField &out);
// protected:
RealD mass;
RealD mass_plus, mass_minus;
// Save arguments to SetCoefficientsInternal
Vector<Coeff_t> _gamma;

View File

@ -0,0 +1,333 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
////////////////////////////////////////////
// Standard Clover
// (4+m0) + csw * clover_term
// Exp Clover
// (4+m0) * exp(csw/(4+m0) clover_term)
// = (4+m0) + csw * clover_term + ...
////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
//////////////////////////////////
// Generic Standard Clover
//////////////////////////////////
template<class Impl>
class CloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
GridBase *grid = CloverTerm.Grid();
CloverTerm += diag_mass;
int lvol = grid->lSites();
int DimRep = Impl::Dimension;
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
EigenInvCloverOp = EigenCloverOp.inverse();
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Generic Exp Clover
//////////////////////////////////
template<class Impl>
class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef WilsonCloverHelpers<Impl> Helpers;
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
Clover *= (1.0/diag_mass);
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
// qN = cN
// qn = cn + qn+1 X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
// prepare inverse
CloverInv = (-1.0)*Clover;
Clover = ExpClover * diag_mass;
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
CloverInv = ExpClover * (1.0/diag_mass);
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
//////////////////////////////////
// Compact Standard Clover
//////////////////////////////////
template<class Impl>
class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
Clover += diag_mass;
}
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
// TODO: implement Cmunu for better performances with compact layout, but don't do it
// here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Compact Exp Clover
//////////////////////////////////
template<class Impl>
class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
Clover *= (1.0/diag_mass);
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
// qN = cN
// qn = cn + qn+1 X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
// prepare inverse
CloverInv = (-1.0)*Clover;
Clover = ExpClover * diag_mass;
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
CloverInv = ExpClover * (1.0/diag_mass);
}
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
if (fixedBoundaries)
{
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
else
{
CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,241 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
Copyright (C) 2020 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
//
// Modifications done here:
//
// Original: clover term = 12x12 matrix per site
//
// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
//
// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
//
// Words per site and improvement compared to original (combined with the input and output spinors):
//
// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
// - Minimal: 2*12 + 36 = 60 words -> 2.80 x less
// - Here: 2*12 + 42 = 66 words -> 2.55 x less
//
// These improvements directly translate to wall-clock time
//
// Data layout:
//
// - diagonal and triangle part as separate lattice fields,
// this was faster than as 1 combined field on all tested machines
// - diagonal: as expected
// - triangle: store upper right triangle in row major order
// - graphical:
// 0 1 2 3 4
// 5 6 7 8
// 9 10 11 = upper right triangle indices
// 12 13
// 14
// 0
// 1
// 2
// 3 = diagonal indices
// 4
// 5
// 0
// 1 5
// 2 6 9 = lower left triangle indices
// 3 7 10 12
// 4 8 11 13 14
//
// Impact on total memory consumption:
// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
// - Here: (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts = 24 complex words per site
// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site
// = 84 complex words per site
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson);
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@ -137,21 +138,52 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
// Clover fermions
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;

View File

@ -4,10 +4,11 @@
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
Copyright (C) 2017
Copyright (C) 2017 - 2022
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: David Preti <>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -29,7 +30,9 @@
#pragma once
#include <Grid/Grid.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
@ -49,19 +52,16 @@ NAMESPACE_BEGIN(Grid);
// csw_r = csw_t to recover the isotropic version
//////////////////////////////////////////////////////////////////
template <class Impl>
class WilsonCloverFermion : public WilsonFermion<Impl>
template<class Impl, class CloverHelpers>
class WilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>
{
public:
// Types definitions
INHERIT_IMPL_TYPES(Impl);
template <typename vtype>
using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteCloverType;
typedef Lattice<SiteCloverType> CloverFieldType;
INHERIT_CLOVER_TYPES(Impl);
public:
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
virtual int ConstEE(void) { return 0; };
virtual void Instantiatable(void){};
@ -72,42 +72,7 @@ public:
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
Fgrid,
Hgrid,
_mass, impl_p, clover_anisotropy),
CloverTerm(&Fgrid),
CloverTermInv(&Fgrid),
CloverTermEven(&Hgrid),
CloverTermOdd(&Hgrid),
CloverTermInvEven(&Hgrid),
CloverTermInvOdd(&Hgrid),
CloverTermDagEven(&Hgrid),
CloverTermDagOdd(&Hgrid),
CloverTermInvDagEven(&Hgrid),
CloverTermInvDagOdd(&Hgrid)
{
assert(Nd == 4); // require 4 dimensions
if (clover_anisotropy.isAnisotropic)
{
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
}
else
{
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if (csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if (csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
const ImplParams &impl_p = ImplParams());
virtual void M(const FermionField &in, FermionField &out);
virtual void Mdag(const FermionField &in, FermionField &out);
@ -124,250 +89,21 @@ public:
void ImportGauge(const GaugeField &_Umu);
// Derivative parts unpreconditioned pseudofermions
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda.Grid(), U[0].Grid());
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
protected:
public:
// here fixing the 4 dimensions, make it more general?
RealD csw_r; // Clover coefficient - spatial
RealD csw_t; // Clover coefficient - temporal
RealD diag_mass; // Mass term
CloverFieldType CloverTerm, CloverTermInv; // Clover term
CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO
CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverXZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()();
T_v[i]()(2, 3) = -F_v[i]()();
T_v[i]()(3, 2) = F_v[i]()();
});
return T;
}
CloverFieldType fillCloverXY(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()());
T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
T_v[i]()(3, 3) = timesI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverXT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()());
T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverYT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()());
T_v[i]()(2, 3) = (F_v[i]()());
T_v[i]()(3, 2) = -(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverZT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
T_v[i]()(3, 3) = timesI(F_v[i]()());
});
return T;
}
CloverField CloverTerm, CloverTermInv; // Clover term
CloverField CloverTermEven, CloverTermOdd; // Clover term EO
CloverField CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverField CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverField CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,763 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
Copyright (C) 2021 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
// Helper routines that implement common clover functionality
NAMESPACE_BEGIN(Grid);
template<class Impl> class WilsonCloverHelpers {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda.Grid(), U[0].Grid());
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
static CloverField fillCloverYZ(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverXZ(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
});
return T;
}
static CloverField fillCloverXY(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverXT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverYT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
});
return T;
}
static CloverField fillCloverZT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
});
return T;
}
template<class _Spinor>
static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
auto CC = coalescedRead(C);
mult(&phi, &CC, &chi);
}
template<class _SpinorField>
inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
const int Nsimd = SiteSpinor::Nsimd();
autoView(out_v, out, AcceleratorWrite);
autoView(phi_v, phi, AcceleratorRead);
autoView(C_v, C, AcceleratorRead);
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multClover(tmp,C_v[sss],phi_v(sss));
coalescedWrite(out_v[sss],tmp);
});
}
};
////////////////////////////////////////////////////////
template<class Impl> class CompactWilsonCloverHelpers {
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
#if 0
static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
assert(i != j);
if(i < j) {
return triangle()(block)(triangle_index(i, j));
} else { // i > j
return conjugate(triangle()(block)(triangle_index(i, j)));
}
}
#else
template<typename vobj>
static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
assert(i != j);
if(i < j) {
return triangle()(block)(triangle_index(i, j));
} else { // i > j
return conjugate(triangle()(block)(triangle_index(i, j)));
}
}
#endif
static accelerator_inline int triangle_index(int i, int j) {
if(i == j)
return 0;
else if(i < j)
return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
else // i > j
return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
}
static void MooeeKernel_gpu(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
autoView(diagonal_v, diagonal, AcceleratorRead);
autoView(triangle_v, triangle, AcceleratorRead);
autoView(in_v, in, AcceleratorRead);
autoView(out_v, out, AcceleratorWrite);
typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
const uint64_t NN = Nsite * Ls;
accelerator_for(ss, NN, Simd::Nsimd(), {
int sF = ss;
int sU = ss/Ls;
CalcSpinor res;
CalcSpinor in_t = in_v(sF);
auto diagonal_t = diagonal_v(sU);
auto triangle_t = triangle_v(sU);
for(int block=0; block<Nhs; block++) {
int s_start = block*Nhs;
for(int i=0; i<Nred; i++) {
int si = s_start + i/Nc, ci = i%Nc;
res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
for(int j=0; j<Nred; j++) {
if (j == i) continue;
int sj = s_start + j/Nc, cj = j%Nc;
res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
};
};
};
coalescedWrite(out_v[sF], res);
});
}
static void MooeeKernel_cpu(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
autoView(diagonal_v, diagonal, CpuRead);
autoView(triangle_v, triangle, CpuRead);
autoView(in_v, in, CpuRead);
autoView(out_v, out, CpuWrite);
typedef SiteSpinor CalcSpinor;
#if defined(A64FX) || defined(A64FXFIXEDSIZE)
#define PREFETCH_CLOVER(BASE) { \
uint64_t base; \
int pf_dist_L1 = 1; \
int pf_dist_L2 = -5; /* -> penalty -> disable */ \
\
if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
} \
\
if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
} \
}
// TODO: Implement/generalize this for other architectures
// I played around a bit on KNL (see below) but didn't bring anything
// #elif defined(AVX512)
// #define PREFETCH_CLOVER(BASE) { \
// uint64_t base; \
// int pf_dist_L1 = 1; \
// int pf_dist_L2 = +4; \
// \
// if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
// base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
// _mm_prefetch((const char*)(base + 0), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 64), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 128), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 192), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 256), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 320), _MM_HINT_T0); \
// } \
// \
// if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
// base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
// _mm_prefetch((const char*)(base + 0), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 64), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 128), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 192), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 256), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 320), _MM_HINT_T1); \
// } \
// }
#else
#define PREFETCH_CLOVER(BASE)
#endif
const uint64_t NN = Nsite * Ls;
thread_for(ss, NN, {
int sF = ss;
int sU = ss/Ls;
CalcSpinor res;
CalcSpinor in_t = in_v[sF];
auto diag_t = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
auto triangle_t = triangle_v[sU];
// upper half
PREFETCH_CLOVER(0);
auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
auto in_cc_1_0 = conjugate(in_t()(1)(0));
auto in_cc_1_1 = conjugate(in_t()(1)(1));
res()(0)(0) = diag_t()(0)( 0) * in_t()(0)(0)
+ triangle_t()(0)( 0) * in_t()(0)(1)
+ triangle_t()(0)( 1) * in_t()(0)(2)
+ triangle_t()(0)( 2) * in_t()(1)(0)
+ triangle_t()(0)( 3) * in_t()(1)(1)
+ triangle_t()(0)( 4) * in_t()(1)(2);
res()(0)(1) = triangle_t()(0)( 0) * in_cc_0_0;
res()(0)(1) = diag_t()(0)( 1) * in_t()(0)(1)
+ triangle_t()(0)( 5) * in_t()(0)(2)
+ triangle_t()(0)( 6) * in_t()(1)(0)
+ triangle_t()(0)( 7) * in_t()(1)(1)
+ triangle_t()(0)( 8) * in_t()(1)(2)
+ conjugate( res()(0)( 1));
res()(0)(2) = triangle_t()(0)( 1) * in_cc_0_0
+ triangle_t()(0)( 5) * in_cc_0_1;
res()(0)(2) = diag_t()(0)( 2) * in_t()(0)(2)
+ triangle_t()(0)( 9) * in_t()(1)(0)
+ triangle_t()(0)(10) * in_t()(1)(1)
+ triangle_t()(0)(11) * in_t()(1)(2)
+ conjugate( res()(0)( 2));
res()(1)(0) = triangle_t()(0)( 2) * in_cc_0_0
+ triangle_t()(0)( 6) * in_cc_0_1
+ triangle_t()(0)( 9) * in_cc_0_2;
res()(1)(0) = diag_t()(0)( 3) * in_t()(1)(0)
+ triangle_t()(0)(12) * in_t()(1)(1)
+ triangle_t()(0)(13) * in_t()(1)(2)
+ conjugate( res()(1)( 0));
res()(1)(1) = triangle_t()(0)( 3) * in_cc_0_0
+ triangle_t()(0)( 7) * in_cc_0_1
+ triangle_t()(0)(10) * in_cc_0_2
+ triangle_t()(0)(12) * in_cc_1_0;
res()(1)(1) = diag_t()(0)( 4) * in_t()(1)(1)
+ triangle_t()(0)(14) * in_t()(1)(2)
+ conjugate( res()(1)( 1));
res()(1)(2) = triangle_t()(0)( 4) * in_cc_0_0
+ triangle_t()(0)( 8) * in_cc_0_1
+ triangle_t()(0)(11) * in_cc_0_2
+ triangle_t()(0)(13) * in_cc_1_0
+ triangle_t()(0)(14) * in_cc_1_1;
res()(1)(2) = diag_t()(0)( 5) * in_t()(1)(2)
+ conjugate( res()(1)( 2));
vstream(out_v[sF]()(0)(0), res()(0)(0));
vstream(out_v[sF]()(0)(1), res()(0)(1));
vstream(out_v[sF]()(0)(2), res()(0)(2));
vstream(out_v[sF]()(1)(0), res()(1)(0));
vstream(out_v[sF]()(1)(1), res()(1)(1));
vstream(out_v[sF]()(1)(2), res()(1)(2));
// lower half
PREFETCH_CLOVER(1);
auto in_cc_2_0 = conjugate(in_t()(2)(0));
auto in_cc_2_1 = conjugate(in_t()(2)(1));
auto in_cc_2_2 = conjugate(in_t()(2)(2));
auto in_cc_3_0 = conjugate(in_t()(3)(0));
auto in_cc_3_1 = conjugate(in_t()(3)(1));
res()(2)(0) = diag_t()(1)( 0) * in_t()(2)(0)
+ triangle_t()(1)( 0) * in_t()(2)(1)
+ triangle_t()(1)( 1) * in_t()(2)(2)
+ triangle_t()(1)( 2) * in_t()(3)(0)
+ triangle_t()(1)( 3) * in_t()(3)(1)
+ triangle_t()(1)( 4) * in_t()(3)(2);
res()(2)(1) = triangle_t()(1)( 0) * in_cc_2_0;
res()(2)(1) = diag_t()(1)( 1) * in_t()(2)(1)
+ triangle_t()(1)( 5) * in_t()(2)(2)
+ triangle_t()(1)( 6) * in_t()(3)(0)
+ triangle_t()(1)( 7) * in_t()(3)(1)
+ triangle_t()(1)( 8) * in_t()(3)(2)
+ conjugate( res()(2)( 1));
res()(2)(2) = triangle_t()(1)( 1) * in_cc_2_0
+ triangle_t()(1)( 5) * in_cc_2_1;
res()(2)(2) = diag_t()(1)( 2) * in_t()(2)(2)
+ triangle_t()(1)( 9) * in_t()(3)(0)
+ triangle_t()(1)(10) * in_t()(3)(1)
+ triangle_t()(1)(11) * in_t()(3)(2)
+ conjugate( res()(2)( 2));
res()(3)(0) = triangle_t()(1)( 2) * in_cc_2_0
+ triangle_t()(1)( 6) * in_cc_2_1
+ triangle_t()(1)( 9) * in_cc_2_2;
res()(3)(0) = diag_t()(1)( 3) * in_t()(3)(0)
+ triangle_t()(1)(12) * in_t()(3)(1)
+ triangle_t()(1)(13) * in_t()(3)(2)
+ conjugate( res()(3)( 0));
res()(3)(1) = triangle_t()(1)( 3) * in_cc_2_0
+ triangle_t()(1)( 7) * in_cc_2_1
+ triangle_t()(1)(10) * in_cc_2_2
+ triangle_t()(1)(12) * in_cc_3_0;
res()(3)(1) = diag_t()(1)( 4) * in_t()(3)(1)
+ triangle_t()(1)(14) * in_t()(3)(2)
+ conjugate( res()(3)( 1));
res()(3)(2) = triangle_t()(1)( 4) * in_cc_2_0
+ triangle_t()(1)( 8) * in_cc_2_1
+ triangle_t()(1)(11) * in_cc_2_2
+ triangle_t()(1)(13) * in_cc_3_0
+ triangle_t()(1)(14) * in_cc_3_1;
res()(3)(2) = diag_t()(1)( 5) * in_t()(3)(2)
+ conjugate( res()(3)( 2));
vstream(out_v[sF]()(2)(0), res()(2)(0));
vstream(out_v[sF]()(2)(1), res()(2)(1));
vstream(out_v[sF]()(2)(2), res()(2)(2));
vstream(out_v[sF]()(3)(0), res()(3)(0));
vstream(out_v[sF]()(3)(1), res()(3)(1));
vstream(out_v[sF]()(3)(2), res()(3)(2));
});
}
static void MooeeKernel(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
#else
MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
#endif
}
static void Invert(const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv) {
conformable(diagonal, diagonalInv);
conformable(triangle, triangleInv);
conformable(diagonal, triangle);
diagonalInv.Checkerboard() = diagonal.Checkerboard();
triangleInv.Checkerboard() = triangle.Checkerboard();
GridBase* grid = diagonal.Grid();
long lsites = grid->lSites();
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
autoView(diagonal_v, diagonal, CpuRead);
autoView(triangle_v, triangle, CpuRead);
autoView(diagonalInv_v, diagonalInv, CpuWrite);
autoView(triangleInv_v, triangleInv, CpuWrite);
thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_inv_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_inv_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
// TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
for (long s_row=0;s_row<Ns;s_row++) {
for (long s_col=0;s_col<Ns;s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for (long c_row=0;c_row<Nc;c_row++) {
for (long c_col=0;c_col<Nc;c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
else
clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
}
}
}
}
clover_inv_eigen = clover_eigen.inverse();
for (long s_row=0;s_row<Ns;s_row++) {
for (long s_col=0;s_col<Ns;s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for (long c_row=0;c_row<Nc;c_row++) {
for (long c_col=0;c_col<Nc;c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
else if(i < j)
triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
else
continue;
}
}
}
}
pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
});
}
static void ConvertLayout(const CloverField& full,
CloverDiagonalField& diagonal,
CloverTriangleField& triangle) {
conformable(full, diagonal);
conformable(full, triangle);
diagonal.Checkerboard() = full.Checkerboard();
triangle.Checkerboard() = full.Checkerboard();
autoView(full_v, full, AcceleratorRead);
autoView(diagonal_v, diagonal, AcceleratorWrite);
autoView(triangle_v, triangle, AcceleratorWrite);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, full.Grid()->oSites(), 1, {
for(int s_row = 0; s_row < Ns; s_row++) {
for(int s_col = 0; s_col < Ns; s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for(int c_row = 0; c_row < Nc; c_row++) {
for(int c_col = 0; c_col < Nc; c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
else if(i < j)
triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
else
continue;
}
}
}
}
});
}
static void ConvertLayout(const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverField& full) {
conformable(full, diagonal);
conformable(full, triangle);
full.Checkerboard() = diagonal.Checkerboard();
full = Zero();
autoView(diagonal_v, diagonal, AcceleratorRead);
autoView(triangle_v, triangle, AcceleratorRead);
autoView(full_v, full, AcceleratorWrite);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, full.Grid()->oSites(), 1, {
for(int s_row = 0; s_row < Ns; s_row++) {
for(int s_col = 0; s_col < Ns; s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for(int c_row = 0; c_row < Nc; c_row++) {
for(int c_col = 0; c_col < Nc; c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
else
full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
}
}
}
}
});
}
static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
// Checks/grid
double t0 = usecond();
conformable(diagonal, triangle);
GridBase* grid = diagonal.Grid();
// Determine the boundary coordinates/sites
double t1 = usecond();
int t_dir = Nd - 1;
Lattice<iScalar<vInteger>> t_coor(grid);
LatticeCoordinate(t_coor, t_dir);
int T = grid->GlobalDimensions()[t_dir];
// Set off-diagonal parts at boundary to zero -- OK
double t2 = usecond();
CloverTriangleField zeroTriangle(grid);
zeroTriangle.Checkerboard() = triangle.Checkerboard();
zeroTriangle = Zero();
triangle = where(t_coor == 0, zeroTriangle, triangle);
triangle = where(t_coor == T-1, zeroTriangle, triangle);
// Set diagonal to unity (scaled correctly) -- OK
double t3 = usecond();
CloverDiagonalField tmp(grid);
tmp.Checkerboard() = diagonal.Checkerboard();
tmp = -1.0 * csw_t + diag_mass;
diagonal = where(t_coor == 0, tmp, diagonal);
diagonal = where(t_coor == T-1, tmp, diagonal);
// Correct values next to boundary
double t4 = usecond();
if(cF != 1.0) {
tmp = cF - 1.0;
tmp += diagonal;
diagonal = where(t_coor == 1, tmp, diagonal);
diagonal = where(t_coor == T-2, tmp, diagonal);
}
// Report timings
double t5 = usecond();
#if 0
std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
<< " checks = " << (t1 - t0) / 1e6
<< ", coordinate = " << (t2 - t1) / 1e6
<< ", off-diag zero = " << (t3 - t2) / 1e6
<< ", diagonal unity = " << (t4 - t3) / 1e6
<< ", near-boundary = " << (t5 - t4) / 1e6
<< ", total = " << (t5 - t0) / 1e6
<< std::endl;
#endif
}
template<class Field, class Mask>
static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
conformable(f, m);
auto grid = f.Grid();
const uint32_t Nsite = grid->oSites();
const uint32_t Nsimd = grid->Nsimd();
autoView(f_v, f, AcceleratorWrite);
autoView(m_v, m, AcceleratorRead);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, Nsite, Nsimd, {
coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
});
}
template<class MaskField>
static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
assert(odd.Grid()->_isCheckerBoarded && odd.Checkerboard() == Odd);
assert(!full.Grid()->_isCheckerBoarded);
GridBase* grid = full.Grid();
int t_dir = Nd-1;
Lattice<iScalar<vInteger>> t_coor(grid);
LatticeCoordinate(t_coor, t_dir);
int T = grid->GlobalDimensions()[t_dir];
MaskField zeroMask(grid); zeroMask = Zero();
full = 1.0;
full = where(t_coor == 0, zeroMask, full);
full = where(t_coor == T-1, zeroMask, full);
pickCheckerboard(Even, even, full);
pickCheckerboard(Odd, odd, full);
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,90 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
Copyright (C) 2021 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Impl>
class WilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteClover;
typedef Lattice<SiteClover> CloverField;
};
template<class Impl>
class CompactWilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
static constexpr int Nred = Nc * Nhs; // 6
static constexpr int Nblock = Nhs; // 2
static constexpr int Ndiagonal = Nred; // 6
static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
typedef iSinglet<Simd> SiteMask;
typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
typedef Lattice<SiteCloverTriangle> CloverTriangleField;
typedef Lattice<SiteMask> MaskField;
};
#define INHERIT_CLOVER_TYPES(Impl) \
typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
#define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal SiteCloverDiagonal; \
typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle SiteCloverTriangle; \
typedef typename CompactWilsonCloverTypes<Impl>::SiteMask SiteMask; \
typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
typedef typename CompactWilsonCloverTypes<Impl>::MaskField MaskField; \
/* ugly duplication but needed inside functionality classes */ \
template<typename vtype> using iImplCloverDiagonal = \
iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
template<typename vtype> using iImplCloverTriangle = \
iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
#define INHERIT_COMPACT_CLOVER_SIZES(Impl) \
static constexpr int Nred = CompactWilsonCloverTypes<Impl>::Nred; \
static constexpr int Nblock = CompactWilsonCloverTypes<Impl>::Nblock; \
static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
NAMESPACE_END(Grid);

View File

@ -117,19 +117,19 @@ public:
typedef decltype(coalescedRead(*in)) sobj;
typedef decltype(coalescedRead(*out0)) hsobj;
unsigned int Nsimd = vobj::Nsimd();
constexpr unsigned int Nsimd = vobj::Nsimd();
unsigned int mask = Nsimd >> (type + 1);
int lane = acceleratorSIMTlane(Nsimd);
int j0 = lane &(~mask); // inner coor zero
int j1 = lane |(mask) ; // inner coor one
const vobj *vp0 = &in[k];
const vobj *vp1 = &in[m];
const vobj *vp = (lane&mask) ? vp1:vp0;
auto sa = coalescedRead(*vp,j0);
auto sb = coalescedRead(*vp,j1);
const vobj *vp0 = &in[k]; // out0[j] = merge low bit of type from in[k] and in[m]
const vobj *vp1 = &in[m]; // out1[j] = merge hi bit of type from in[k] and in[m]
const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
auto sb = coalescedRead(*vp,j1); // lane to read for out 1
hsobj psa, psb;
projector::Proj(psa,sa,mu,dag);
projector::Proj(psb,sb,mu,dag);
projector::Proj(psa,sa,mu,dag); // spin project the result0
projector::Proj(psb,sb,mu,dag); // spin project the result1
coalescedWrite(out0[j],psa);
coalescedWrite(out1[j],psb);
#else

View File

@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
FiveDimRedBlackGrid,
FourDimGrid,
FourDimRedBlackGrid,_M5,p),
mass(_mass)
mass_plus(_mass), mass_minus(_mass)
{
}
@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
Vector<Coeff_t> diag (Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass;
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
M5D(psi,chi,chi,lower,diag,upper);
}
template<class Impl>
@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
Vector<Coeff_t> diag = bs;
Vector<Coeff_t> upper= cs;
Vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,Din,lower,diag,upper);
}
// FIXME Redunant with the above routine; check this and eliminate
@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &
upper[i]=-ceo[i];
lower[i]=-ceo[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &
upper[i]=-cee[i];
lower[i]=-cee[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &
// Assemble the 5d matrix
if ( s==0 ) {
upper[s] = -cee[s+1] ;
lower[s] = mass*cee[Ls-1];
lower[s] = mass_minus*cee[Ls-1];
} else if ( s==(Ls-1)) {
upper[s] = mass*cee[0];
upper[s] = mass_plus*cee[0];
lower[s] = -cee[s-1];
} else {
upper[s]=-cee[s+1];
@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0);
Vector<Coeff_t> lower(Ls,-1.0);
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_plus*upper[Ls-1];
lower[0] =-mass_minus*lower[0];
M5Ddag(psi,chi,chi,lower,diag,upper);
}
@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
for (int s=0;s<Ls;s++){
if ( s== 0 ) {
upper[s] = cs[s+1];
lower[s] =-mass*cs[Ls-1];
lower[s] =-mass_minus*cs[Ls-1];
} else if ( s==(Ls-1) ) {
upper[s] =-mass*cs[0];
upper[s] =-mass_plus*cs[0];
lower[s] = cs[s-1];
} else {
upper[s] = cs[s+1];
@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass*cee[Ls-1]/bee[0];
leem[i]=mass_minus*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) {
assert(bee[j+1]!=Coeff_t(0.0));
leem[i]*= aee[j]/bee[j+1];
@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
ueem[i]=mass;
ueem[i]=mass_plus;
for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
ueem[i]*= aee[0]/bee[0];
@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
}
{
Coeff_t delta_d=mass*cee[Ls-1];
Coeff_t delta_d=mass_minus*cee[Ls-1];
for(int j=0;j<Ls-1;j++) {
assert(bee[j] != Coeff_t(0.0));
delta_d *= cee[j]/bee[j];
@ -642,6 +642,10 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
@ -777,6 +781,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
assert(mu>=0);
assert(mu<Nd);
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if 0
int tshift = (mu == Nd-1) ? 1 : 0;
@ -828,6 +834,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
#if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
////////////////////////////////////////////////
// GENERAL CAYLEY CASE
////////////////////////////////////////////////
@ -880,7 +887,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
std::vector<RealD> G_s(Ls,1.0);
RealD sign = 1; // sign flip for vector/tadpole
RealD sign = 1.0; // sign flip for vector/tadpole
if ( curr_type == Current::Axial ) {
for(int s=0;s<Ls/2;s++){
G_s[s] = -1.0;
@ -890,7 +897,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
auto b=this->_b;
auto c=this->_c;
if ( b == 1 && c == 0 ) {
sign = -1;
sign = -1.0;
}
else {
std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
@ -934,7 +941,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -G_s[s]*( Utmp + gmu*Utmp );
tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
// Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
InsertSlice(L_Q, q_out, s , 0);

View File

@ -0,0 +1,377 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&Fgrid), Triangle(&Fgrid)
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
, DiagonalInv(&Fgrid), TriangleInv(&Fgrid)
, DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
, DiagonalInvOdd(&Hgrid), TriangleInvOdd(&Hgrid)
, Tmp(&Fgrid)
, BoundaryMask(&Fgrid)
, BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
if (clover_anisotropy.isAnisotropic)
csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
} else {
MooeeInternal(in, out, DiagonalEven, TriangleEven);
}
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
} else {
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
}
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
conformable(in, diagonal);
conformable(in, triangle);
CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
// TODO: For now this inversion is explictly done on the CPU
double t7 = usecond();
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);

View File

@ -2,12 +2,13 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
Copyright (C) 2017
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -33,9 +34,48 @@
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, CloverTerm(&Fgrid)
, CloverTermInv(&Fgrid)
, CloverTermEven(&Hgrid)
, CloverTermOdd(&Hgrid)
, CloverTermInvEven(&Hgrid)
, CloverTermInvOdd(&Hgrid)
, CloverTermDagEven(&Hgrid)
, CloverTermDagOdd(&Hgrid)
, CloverTermInvDagEven(&Hgrid)
, CloverTermInvDagOdd(&Hgrid) {
assert(Nd == 4); // require 4 dimensions
if(clover_anisotropy.isAnisotropic) {
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
} else {
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if(csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if(csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
// *NOT* EO
template <class Impl>
void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@ -49,8 +89,8 @@ void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
out += temp;
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@ -64,13 +104,16 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
out += temp;
}
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
{
double t0 = usecond();
WilsonFermion<Impl>::ImportGauge(_Umu);
double t1 = usecond();
GridBase *grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
double t2 = usecond();
// Compute the field strength terms mu>nu
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@ -79,52 +122,20 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
double t3 = usecond();
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
CloverTerm = fillCloverYZ(Bx) * csw_r;
CloverTerm += fillCloverXZ(By) * csw_r;
CloverTerm += fillCloverXY(Bz) * csw_r;
CloverTerm += fillCloverXT(Ex) * csw_t;
CloverTerm += fillCloverYT(Ey) * csw_t;
CloverTerm += fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;
int lvol = _Umu.Grid()->lSites();
int DimRep = Impl::Dimension;
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
CloverTerm = Helpers::fillCloverYZ(Bx) * csw_r;
CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
double t4 = usecond();
CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
double t5 = usecond();
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@ -137,37 +148,47 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
double t6 = usecond();
std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiation = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t6 - t0) / 1e6 << std::endl;
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.Checkerboard() = in.Checkerboard();
CloverFieldType *Clover;
CloverField *Clover;
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
if (dag)
@ -182,12 +203,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
{
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
}
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = adj(*Clover) * in;
Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
}
}
else
@ -205,29 +226,109 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
// std::cout << "Calling clover term Even" << std::endl;
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
}
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
// std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
}
}
} // MooeeInternal
// Derivative parts unpreconditioned pseudofermions
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
{
assert(0);
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
assert(0); // not implemented yet
}

View File

@ -4,12 +4,13 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
Copyright (C) 2015
Copyright (C) 2022
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -599,11 +600,47 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
Gamma g5(Gamma::Algebra::Gamma5);
conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid());
assert(0);
auto UGrid= this->GaugeGrid();
PropagatorField tmp_shifted(UGrid);
PropagatorField g5Lg5(UGrid);
PropagatorField R(UGrid);
PropagatorField gmuR(UGrid);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
g5Lg5=g5*q_in_1*g5;
tmp_shifted=Cshift(q_in_2,mu,1);
Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
gmuR=gmu*R;
q_out=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
tmp_shifted=Cshift(q_in_1,mu,1);
Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
g5Lg5=g5*g5Lg5*g5;
R=q_in_2;
gmuR=gmu*R;
q_out-=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
}
@ -617,9 +654,51 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid());
assert(0);
auto UGrid= this->GaugeGrid();
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
PropagatorField L(UGrid);
PropagatorField zz (UGrid);
zz=Zero();
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
tmp = Cshift(q_in,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
tmp = q_in *lattice_cmplx;
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -( Utmp + gmu*Utmp );
// Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
}
NAMESPACE_END(Grid);

View File

@ -77,23 +77,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define REGISTER
#ifdef GRID_SIMT
#define LOAD_CHIMU(ptype) \
#define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \
Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane); \
Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane); \
Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane); \
Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane); \
Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane); \
Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane); \
Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane); \
Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane); \
Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane); \
Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane); \
Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane); \
Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane); }
Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane); \
Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane); \
Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane); \
Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane); \
Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane); \
Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane); \
Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane); \
Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane); \
Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane); \
Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane); \
Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane); \
Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane); }
#define PERMUTE_DIR(dir) ;
#else
#define LOAD_CHIMU(ptype) \
#define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
@ -109,12 +109,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Chimu_32=ref()(3)(2);}
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00); \
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10); \
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);
permute##dir(Chi_00,Chi_00); \
permute##dir(Chi_01,Chi_01); \
permute##dir(Chi_02,Chi_02); \
permute##dir(Chi_10,Chi_10); \
permute##dir(Chi_11,Chi_11); \
permute##dir(Chi_12,Chi_12);
#endif
@ -371,88 +371,91 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
result_32-= UChi_12;
#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON;
{int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
SE=&st_p[DIR+8*ss]; \
ptype=st_perm[DIR]; \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
{ SE=&st_p[DIR+8*ss]; \
auto ptype=st_perm[DIR]; \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \
SE=&st_p[DIR+8*ss]; \
ptype=st_perm[DIR]; \
/*SE=st.GetEntry(ptype,DIR,ss);*/ \
offset = SE->_offset; \
perm = SE->_permute; \
LOAD_CHIMU(PERM); \
PROJ; \
MULT_2SPIN(DIR); \
RECON;
{ SE=&st_p[DIR+8*ss]; \
auto ptype=st_perm[DIR]; \
/*SE=st.GetEntry(ptype,DIR,ss);*/ \
auto offset = SE->_offset; \
auto perm = SE->_permute; \
LOAD_CHIMU(PERM); \
PROJ; \
MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN(DIR); \
RECON; \
} \
acceleratorSynchronise();
{ int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN(DIR); \
RECON; \
} \
acceleratorSynchronise(); }
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
} \
acceleratorSynchronise();
{ int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
auto offset = SE->_offset; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
} \
acceleratorSynchronise(); }
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out[ss]); \
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out[ss]); \
coalescedWrite(ref()(0)(0),result_00,lane); \
coalescedWrite(ref()(0)(1),result_01,lane); \
coalescedWrite(ref()(0)(2),result_02,lane); \
@ -563,7 +566,6 @@ WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
@ -593,9 +595,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@ -623,8 +623,6 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
HAND_DECLARATIONS(Simt);
StencilEntry *SE;
int offset,local,perm, ptype;
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@ -640,8 +638,8 @@ template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
@ -652,7 +650,6 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@ -670,8 +667,8 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@ -682,7 +679,6 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
HAND_DECLARATIONS(Simt);
StencilEntry *SE;
int offset,local,perm, ptype;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@ -699,8 +695,8 @@ template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
@ -711,7 +707,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_DECLARATIONS(Simt);
int offset, ptype;
// int offset, ptype;
StencilEntry *SE;
int nmu=0;
ZERO_RESULT;
@ -730,8 +726,8 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
@ -742,7 +738,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
HAND_DECLARATIONS(Simt);
StencilEntry *SE;
int offset, ptype;
// int offset, ptype;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);

View File

@ -498,6 +498,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
acceleratorFenceComputeStream();
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
@ -505,11 +506,13 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif
acceleratorFenceComputeStream();
}
assert(0 && " Kernel optimisation case not covered ");
}

View File

@ -0,0 +1,44 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -8,7 +8,8 @@
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
@ -31,10 +32,12 @@
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonCloverFermion<IMPLEMENTATION>;
template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>;
template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../CompactWilsonCloverFermionInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -0,0 +1 @@
../CompactWilsonCloverFermionInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -18,6 +18,10 @@ WILSON_IMPL_LIST=" \
GparityWilsonImplF \
GparityWilsonImplD "
COMPACT_WILSON_IMPL_LIST=" \
WilsonImplF \
WilsonImplD "
DWF_IMPL_LIST=" \
WilsonImplF \
WilsonImplD \
@ -46,7 +50,17 @@ for impl in $WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
CC_LIST="CompactWilsonCloverFermionInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
@ -63,14 +77,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
# overwrite the .cc file in Gparity directories
for impl in $GDWF_IMPL_LIST
do
ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
done
@ -84,7 +98,7 @@ for impl in $STAG_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done

View File

@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid);
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;
typedef Field ComplexField;
typedef LinkField ComplexField;
};
typedef QedGImpl<vComplex> QedGImplR;

View File

@ -1,61 +1,63 @@
Using HMC in Grid version 0.5.1
# Using HMC in Grid
These are the instructions to use the Generalised HMC on Grid version 0.5.1.
Disclaimer: GRID is still under active development so any information here can be changed in future releases.
These are the instructions to use the Generalised HMC on Grid as of commit `749b802`.
Disclaimer: Grid is still under active development so any information here can be changed in future releases.
Command line options
===================
(relevant file GenericHMCrunner.h)
## Command line options
(relevant file `GenericHMCrunner.h`)
The initial configuration can be changed at the command line using
--StartType <your choice>
valid choices, one among these
HotStart, ColdStart, TepidStart, CheckpointStart
default: HotStart
`--StartingType STARTING_TYPE`, where `STARTING_TYPE` is one of
`HotStart`, `ColdStart`, `TepidStart`, and `CheckpointStart`.
Default: `--StartingType HotStart`
example
./My_hmc_exec --StartType HotStart
Example:
```
./My_hmc_exec --StartingType HotStart
```
The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
default: 0
The `CheckpointStart` option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
`--StartingTrajectory STARTING_TRAJECTORY`, where `STARTING_TRAJECTORY` is an integer.
Default: `--StartingTrajectory 0`
The number of trajectories for a specific run are specified at command line by
--Trajectories <integer>
default: 1
`--Trajectories TRAJECTORIES`, where `TRAJECTORIES` is an integer.
Default: `--Trajectories 1`
The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
default: 10
`--Thermalizations THERMALIZATIONS`, where `THERMALIZATIONS` is an integer.
Default: `--Thermalizations 10`
Any other parameter is defined in the source for the executable.
HMC controls
===========
## HMC controls
The lines
```
std::vector<int> SerSeed({1, 2, 3, 4, 5});
std::vector<int> ParSeed({6, 7, 8, 9, 10});
```
define the seeds for the serial and the parallel RNG.
The line
```
TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
```
declares the number of molecular dynamics steps and the total trajectory length.
Actions
======
## Actions
Action names are defined in the file
lib/qcd/Actions.h
Action names are defined in the directory `Grid/qcd/action`.
Gauge actions list:
Gauge actions list (from `Grid/qcd/action/gauge/Gauge.h`):
```
WilsonGaugeActionR;
WilsonGaugeActionF;
WilsonGaugeActionD;
@ -68,8 +70,9 @@ IwasakiGaugeActionD;
SymanzikGaugeActionR;
SymanzikGaugeActionF;
SymanzikGaugeActionD;
```
```
ConjugateWilsonGaugeActionR;
ConjugateWilsonGaugeActionF;
ConjugateWilsonGaugeActionD;
@ -82,26 +85,23 @@ ConjugateIwasakiGaugeActionD;
ConjugateSymanzikGaugeActionR;
ConjugateSymanzikGaugeActionF;
ConjugateSymanzikGaugeActionD;
```
Each of these action accepts one single parameter at creation time (beta).
Example for creating a Symanzik action with beta=4.0
```
SymanzikGaugeActionR(4.0)
```
Scalar actions list (from `Grid/qcd/action/scalar/Scalar.h`):
```
ScalarActionR;
ScalarActionF;
ScalarActionD;
```
each of these action accept one single parameter at creation time (beta).
Example for creating a Symanzik action with beta=4.0
SymanzikGaugeActionR(4.0)
The suffixes R,F,D in the action names refer to the Real
(the precision is defined at compile time by the --enable-precision flag in the configure),
Float and Double, that force the precision of the action to be 32, 64 bit respectively.
The suffixes `R`, `F`, `D` in the action names refer to the `Real`
(the precision is defined at compile time by the `--enable-precision` flag in the configure),
`Float` and `Double`, that force the precision of the action to be 32, 64 bit respectively.

View File

@ -55,12 +55,12 @@ public:
}
}
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
GridBase *grid = Umu.Grid();
GaugeMat xform(grid);
SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
}
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
GridBase *grid = Umu.Grid();
@ -122,6 +122,8 @@ public:
}
}
std::cout << GridLogError << "Gauge fixing did not converge in " << maxiter << " iterations." << std::endl;
if (err_on_no_converge) assert(0);
};
static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
GridBase *grid = U[0].Grid();

View File

@ -125,7 +125,6 @@ public:
return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
}
//////////////////////////////////////////////////
// average over all x,y,z the temporal loop
//////////////////////////////////////////////////
@ -165,7 +164,7 @@ public:
double vol = Umu.Grid()->gSites();
return p.real() / vol / 4.0 / 3.0;
return p.real() / vol / (4.0 * Nc ) ;
};
//////////////////////////////////////////////////

View File

@ -26,7 +26,7 @@
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#ifndef GRID_HIP
NAMESPACE_BEGIN(Grid);
@ -82,7 +82,7 @@ void JSONWriter::writeDefault(const std::string &s, const std::string &x)
if (s.size())
ss_ << "\""<< s << "\" : \"" << os.str() << "\" ," ;
else
ss_ << os.str() << " ," ;
ss_ << "\""<< os.str() << "\" ," ;
}
// Reader implementation ///////////////////////////////////////////////////////

View File

@ -54,7 +54,7 @@ namespace Grid
void pop(void);
template <typename U>
void writeDefault(const std::string &s, const U &x);
#ifdef __NVCC__
#if defined(GRID_CUDA) || defined(GRID_HIP)
void writeDefault(const std::string &s, const Grid::ComplexD &x)
{
std::complex<double> z(real(x),imag(x));
@ -101,7 +101,7 @@ namespace Grid
void readDefault(const std::string &s, std::vector<U> &output);
template <typename U, typename P>
void readDefault(const std::string &s, std::pair<U,P> &output);
#ifdef __NVCC__
#if defined(GRID_CUDA) || defined(GRID_HIP)
void readDefault(const std::string &s, ComplexD &output)
{
std::complex<double> z;

View File

@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "BinaryIO.h"
#include "TextIO.h"
#include "XmlIO.h"
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#ifndef GRID_HIP
#include "JSON_IO.h"
#endif

View File

@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
commVector<cobj *> pointers,
int dimension,int plane,
int cbmask,compressor &compress,int type) __attribute__((noinline));
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
compressor &compress,int type)
{
assert( (table.size()&0x1)==0);
@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto rhs_p = &rhs_v[0];
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
accelerator_forNB(j, num, vobj::Nsimd(), {
compress.CompressExchange(p0,p1, &rhs_v[0], j,
so+tp[2*j ].second,
so+tp[2*j+1].second,
type);
compress.CompressExchange(p0,p1, rhs_p, j,
so+tp[2*j ].second,
so+tp[2*j+1].second,
type);
});
rhs_v.ViewClose();
}
@ -230,8 +234,8 @@ public:
};
struct Merge {
cobj * mpointer;
Vector<scalar_object *> rpointers;
Vector<cobj *> vpointers;
// std::vector<scalar_object *> rpointers;
std::vector<cobj *> vpointers;
Integer buffer_size;
Integer type;
};
@ -322,8 +326,8 @@ public:
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int recv_from_rank;
int xmit_to_rank;
// int recv_from_rank;
// int xmit_to_rank;
if ( ! comm_dim ) return 1;
if ( displacement == 0 ) return 1;
@ -406,6 +410,7 @@ public:
comms_bytes+=bytes;
shm_bytes +=2*Packets[i].bytes-bytes;
}
_grid->StencilBarrier();// Synch shared memory on a single nodes
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@ -420,7 +425,7 @@ public:
////////////////////////////////////////////////////////////////////////
void Communicate(void)
{
if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
if ( 0 ){
thread_region {
// must be called in parallel region
int mythread = thread_num();
@ -569,7 +574,7 @@ public:
d.buffer_size = buffer_size;
dv.push_back(d);
}
void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m;
m.type = type;
m.mpointer = merge_p;
@ -582,6 +587,7 @@ public:
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
mpi3synctime-=usecond();
accelerator_barrier();
_grid->StencilBarrier();// Synch shared memory on a single nodes
mpi3synctime+=usecond();
shmmergetime-=usecond();
@ -1114,8 +1120,8 @@ public:
int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
Vector<cobj *> rpointers(maxl);
Vector<cobj *> spointers(maxl);
std::vector<cobj *> rpointers(maxl);
std::vector<cobj *> spointers(maxl);
///////////////////////////////////////////
// Work out what to send where

View File

@ -47,20 +47,20 @@ NAMESPACE_BEGIN(Grid);
class TypePair {
public:
T _internal[2];
TypePair<T>& operator=(const Grid::Zero& o) {
accelerator TypePair<T>& operator=(const Grid::Zero& o) {
_internal[0] = Zero();
_internal[1] = Zero();
return *this;
}
TypePair<T> operator+(const TypePair<T>& o) const {
accelerator TypePair<T> operator+(const TypePair<T>& o) const {
TypePair<T> r;
r._internal[0] = _internal[0] + o._internal[0];
r._internal[1] = _internal[1] + o._internal[1];
return r;
}
TypePair<T>& operator+=(const TypePair<T>& o) {
accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
_internal[0] += o._internal[0];
_internal[1] += o._internal[1];
return *this;

View File

@ -74,29 +74,43 @@ void acceleratorInit(void)
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_DEFAULT_GPU
int device = 0;
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) {
printf("AcceleratorCudaInit: using default device \n");
printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n");
printf("AcceleratorCudaInit: assume user either uses\n");
printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
}
#else
int device = rank;
printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
cudaSetDevice(rank);
#endif
cudaSetDevice(device);
cudaStreamCreate(&copyStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
cudaDeviceGetPCIBusId(busid, len, device);
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
}
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
}
#endif
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
void acceleratorInit(void)
{
int nDevices = 1;
@ -154,16 +168,25 @@ void acceleratorInit(void)
#ifdef GRID_DEFAULT_GPU
if ( world_rank == 0 ) {
printf("AcceleratorHipInit: using default device \n");
printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n");
printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
}
int device = 0;
#else
if ( world_rank == 0 ) {
printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n");
printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
}
hipSetDevice(rank);
int device = rank;
#endif
hipSetDevice(device);
hipStreamCreate(&copyStream);
const int len=64;
char busid[len];
if( rank == world_rank ) {
hipDeviceGetPCIBusId(busid, len, device);
printf("local rank %d device %d bus id: %s\n", rank, device, busid);
}
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");
}
#endif
@ -172,12 +195,15 @@ void acceleratorInit(void)
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
// theCopyAccelerator = new sycl::queue (selectedDevice);
theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
zeInit(0);

View File

@ -95,6 +95,7 @@ void acceleratorInit(void);
//////////////////////////////////////////////
#ifdef GRID_CUDA
#include <cuda.h>
#ifdef __CUDA_ARCH__
@ -115,6 +116,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#endif
} // CUDA specific
inline void cuda_mem(void)
{
size_t free_t,total_t,used_t;
cudaMemGetInfo(&free_t,&total_t);
used_t=total_t-free_t;
std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
}
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
int nt=acceleratorThreads(); \
@ -221,6 +230,7 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline int acceleratorIsCommunicable(void *ptr)
{
// int uvm=0;
@ -237,7 +247,6 @@ inline int acceleratorIsCommunicable(void *ptr)
//////////////////////////////////////////////
// SyCL acceleration
//////////////////////////////////////////////
#ifdef GRID_SYCL
NAMESPACE_END(Grid);
#include <CL/sycl.hpp>
@ -252,6 +261,7 @@ NAMESPACE_END(Grid);
NAMESPACE_BEGIN(Grid);
extern cl::sycl::queue *theGridAccelerator;
extern cl::sycl::queue *theCopyAccelerator;
#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
@ -279,7 +289,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(8)]] \
[[intel::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
@ -288,19 +298,19 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
}); \
});
#define accelerator_barrier(dummy) theGridAccelerator->wait();
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
theGridAccelerator->memcpy(to,from,bytes);
}
inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
{
#if 0
@ -328,10 +338,11 @@ NAMESPACE_BEGIN(Grid);
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
extern hipStream_t copyStream;
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return hipThreadIdx_z;
return hipThreadIdx_x;
#else
return 0;
#endif
@ -345,19 +356,41 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
{ __VA_ARGS__;} \
}; \
int nt=acceleratorThreads(); \
dim3 hip_threads(nt,1,nsimd); \
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd,lambda); \
dim3 hip_threads(nsimd, nt, 1); \
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd, lambda); \
} else { \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd, lambda); \
} \
}
template<typename lambda> __global__
__launch_bounds__(64,1)
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
// Following the same scheme as CUDA for now
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
uint64_t z = threadIdx.x;
if ( (x < numx) && (y<numy) && (z<numz) ) {
Lambda(x,y,z);
}
}
template<typename lambda> __global__
__launch_bounds__(1024,1)
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
// Following the same scheme as CUDA for now
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
uint64_t z = threadIdx.x;
if ( (x < numx) && (y<numy) && (z<numz) ) {
Lambda(x,y,z);
}
@ -402,10 +435,16 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
inline void acceleratorCopySynchronise(void) { }
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
//inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToDevice,copyStream);
}
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
#endif
//////////////////////////////////////////////
@ -442,9 +481,10 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@ -471,23 +511,26 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,
inline void acceleratorFreeCpu (void *ptr){free(ptr);};
#endif
//////////////////////////////////////////////
// Fencing needed ONLY for SYCL
//////////////////////////////////////////////
#ifdef GRID_SYCL
inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
#else
// Ordering within a stream guaranteed on Nvidia & AMD
inline void acceleratorFenceComputeStream(void){ };
#endif
///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch
///////////////////////////////////////////////////
accelerator_inline void acceleratorSynchronise(void)
accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__syncwarp();
#endif
#ifdef GRID_SYCL
//cl::sycl::detail::workGroupBarrier();
#endif
#ifdef GRID_HIP
__syncthreads();
#endif
#endif
return;
}

View File

@ -72,3 +72,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_region DO_PRAGMA(omp parallel)
#define thread_critical DO_PRAGMA(omp critical)
#ifdef GRID_OMP
inline void thread_bcopy(void *from, void *to,size_t bytes)
{
uint64_t *ufrom = (uint64_t *)from;
uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0);
uint64_t words=bytes/8;
thread_for(w,words,{
uto[w] = ufrom[w];
});
}
#else
inline void thread_bcopy(void *from, void *to,size_t bytes)
{
bcopy(from,to,bytes);
}
#endif

View File

@ -88,7 +88,7 @@ public:
// Coordinate class, maxdims = 8 for now.
////////////////////////////////////////////////////////////////
#define GRID_MAX_LATTICE_DIMENSION (8)
#define GRID_MAX_SIMD (16)
#define GRID_MAX_SIMD (32)
static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION;

View File

@ -167,6 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
return;
}
void GridCmdOptionFloat(std::string &str,float & val)
{
std::stringstream ss(str);
ss>>val;
return;
}
void GridParseLayout(char **argv,int argc,
Coordinate &latt_c,
@ -527,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
void Grid_finalize(void)
{
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
Grid_unquiesce_nodes();
#endif

View File

@ -57,6 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
template<class VectorInt>
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
void GridCmdOptionInt(std::string &str,int & val);
void GridCmdOptionFloat(std::string &str,float & val);
void GridParseLayout(char **argv,int argc,

View File

@ -27,6 +27,7 @@
/* END LEGAL */
extern "C" {
#include <openssl/sha.h>
#include <openssl/evp.h>
}
#ifdef USE_IPP
#include "ipp.h"
@ -70,10 +71,8 @@ public:
static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
{
std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
SHA256_CTX sha256;
SHA256_Init (&sha256);
SHA256_Update(&sha256, data,bytes);
SHA256_Final (&hash[0], &sha256);
auto digest = EVP_get_digestbyname("SHA256");
EVP_Digest(data, bytes, &hash[0], NULL, digest, NULL);
return hash;
}
static inline std::vector<int> sha256_seeds(const std::string &s)

View File

@ -148,7 +148,7 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
- `--enable-numa`: enable NUMA first touch optimisation
- `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 64 bytes).
- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below.
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `).
- `--disable-timers`: disable system dependent high-resolution timers.

View File

@ -137,7 +137,7 @@ int main (int argc, char ** argv)
Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4);
Eigen::VectorXd avMean(4), avStdDev(4), avRob(4);
double n = BENCH_IO_NPASS;
// double n = BENCH_IO_NPASS;
stats(mean, stdDev, perf);
stats(avMean, avStdDev, avPerf);
@ -164,7 +164,7 @@ int main (int argc, char ** argv)
mean(volInd(l), gWrite), stdDev(volInd(l), gWrite));
}
MSG << std::endl;
MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
MSG << "Robustness of individual results, in %. (rob = 100% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%4s %12s %12s %12s %12s\n",
"L", "std read", "std write", "Grid read", "Grid write");
@ -185,7 +185,7 @@ int main (int argc, char ** argv)
avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite),
avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite));
MSG << std::endl;
MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl;
MSG << "Robustness of volume-averaged results, in %. (rob = 100% - std dev / mean)" << std::endl;
MSG << std::endl;
grid_printf("%12s %12s %12s %12s\n",
"std read", "std write", "Grid read", "Grid write");

View File

@ -142,7 +142,7 @@ public:
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm;
// int ncomm;
double dbytes;
for(int dir=0;dir<8;dir++) {
@ -290,7 +290,7 @@ public:
LatticeSU4 z(&Grid); z=Zero();
LatticeSU4 x(&Grid); x=Zero();
LatticeSU4 y(&Grid); y=Zero();
double a=2.0;
// double a=2.0;
uint64_t Nloop=NLOOP;

View File

@ -72,7 +72,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
std::vector<double> t_time(Nloop);
time_statistics timestat;
// time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange from host memory "<<std::endl;

View File

@ -126,19 +126,10 @@ int main (int argc, char ** argv)
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
@ -147,10 +138,28 @@ int main (int argc, char ** argv)
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
@ -182,7 +191,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =3000;
int ncall =300;
if (1) {
FGrid->Barrier();
@ -242,16 +251,30 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
tmp = Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
}
}
tmp =adj(U[mu])*src;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);

Some files were not shown because too many files have changed in this diff Show More