1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-24 01:34:47 +01:00

Compare commits

..

542 Commits

Author SHA1 Message Date
Peter Boyle
97a098636d FermToProp 2022-11-30 15:36:35 -05:00
Peter Boyle
e13930c8b2 Faster fermtoprop case 2022-11-30 15:11:29 -05:00
Peter Boyle
0655dab466 Open MP on host enabled 2022-11-08 13:38:54 -08:00
Peter Boyle
7f097bcc28 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-11-08 13:23:40 -08:00
Peter Boyle
5c75aa5008 Device mem 2022-11-08 13:22:57 -08:00
Peter Boyle
1873101362 PVC 2022-11-08 13:22:45 -08:00
Peter Boyle
63fd1dfa62 Config on PVC 2022-11-08 13:22:09 -08:00
Peter Boyle
bd68861b28 SYCL sum 2022-11-08 12:49:26 -08:00
Peter Boyle
82e959f66c SYCL reduction 2022-11-08 12:45:25 -08:00
Peter Boyle
62e52de06d Merge pull request #414 from fjosw/feat/eCloverGPU
Compact Exponential Cloverterm on GPU
2022-11-01 09:15:44 -04:00
184adeedb8 feat: renamed open_boundaries to fixedBoundaries 2022-10-26 12:53:46 +01:00
5fa6a8b96d docs: CompactClover debug info generalized. 2022-10-26 12:41:14 +01:00
a2a879b668 docs: CompactClover Debug Info improved. 2022-10-25 17:20:42 +01:00
9317d893b2 docs: details about inversion of CompactClover term added. 2022-10-25 17:10:06 +01:00
86075fdd45 feat: MassTerm and ExponentiateClover merged into InstantiateClover 2022-10-25 17:05:34 +01:00
b36442e263 feat: CloverHelpers::InvertClover implemented which handles the
inversion of the Clover term depending on clover type and the boundary
conditions.
2022-10-25 16:57:01 +01:00
513d797ea6 fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed. 2022-10-25 16:17:22 +01:00
9e4835a3e3 feat: changed CompactWilsonExpClover exponentiation to Taylor expansion
with Horner scheme.
2022-10-25 15:19:43 +01:00
Peter Boyle
477ebf24f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-10-04 11:19:43 -07:00
Peter Boyle
0d5639f707 Run script update 2022-10-04 11:13:41 -07:00
Peter Boyle
413312f9a9 Benchmark the halo construction.
THe bye counts are out and should be doubled for SIMD directions
2022-10-04 11:12:59 -07:00
Peter Boyle
03508448f8 Remove verbose 2022-10-04 11:12:15 -07:00
Peter Boyle
e1e5c75023 Stencil gather improvements - SVM was running slow and used for a pointer array that wasn't needed to be in SVM 2022-10-04 11:11:10 -07:00
Peter Boyle
9296299b61 Better commenting 2022-10-04 11:10:34 -07:00
Peter Boyle
913fbca74a Merge pull request #410 from gkanwar/photon_and_sha_patches
Photon.h and SHA256 patches
2022-08-31 18:01:45 -04:00
Gurtej Kanwar
60dfb49afa Remove FP16 tests when FP16 is disabled 2022-08-21 17:29:55 +02:00
Gurtej Kanwar
554c238359 Update OpenSSL digest to use high-level methods
This avoids deprecation warnings when compiling against OpenSSL 3.0
but should still be backwards compatible. It is the recommended way
to use the digest API going forward.
2022-08-21 17:28:57 +02:00
Gurtej Kanwar
f922adf05e Fix Photon ComplexField type 2022-08-21 16:16:18 +02:00
Peter Boyle
188d2c7a4d PVC default, ignore ATS 2022-08-02 08:38:53 -07:00
Peter Boyle
17d7177105 Files for SYCL 2022-08-02 08:33:39 -07:00
Peter Boyle
bb0a0da47a inon blocking caution due to SYCL 2022-08-02 08:09:43 -07:00
Peter Boyle
84110166e4 Fix the fence 2022-08-02 08:00:43 -07:00
Peter Boyle
d32b923b6c Fencing on a stream in SYCL is needed. Didn't know that ... gulp 2022-08-02 07:58:04 -07:00
Peter Boyle
2ab1af5754 Ensure no synchronize and not optoin dependent 2022-07-19 09:51:06 -07:00
Peter Boyle
5f8892bf03 Mistake pointed out by Camilo 2022-07-19 09:31:51 -07:00
Peter Boyle
f14e7e51e7 Grid accelerator 2022-07-12 10:56:22 -07:00
Peter Boyle
042ab1a052 Update GridStd.h 2022-06-27 13:21:39 -04:00
Peter Boyle
2df98a99bc Merge pull request #406 from giordano/patch-1
Update default value of gen-simd-width in README
2022-06-14 17:46:25 -04:00
Mosè Giordano
315ea18be2 Update default value of gen-simd-width in README 2022-06-14 22:41:05 +01:00
Peter Boyle
a9c2e1df03 Merge pull request #404 from rrhodgson/feature/json_nvcc
Feature/json nvcc
2022-05-25 13:30:11 -04:00
da4daea57a Updated json to latest release 3.10.5 2022-05-24 16:16:06 +01:00
Peter Boyle
af3b065add Merge pull request #403 from fjosw/fix/cuda_11_5_warnings
Fixed nvcc 11.5+ warnings
2022-05-24 11:10:02 -04:00
e346154c5d Updated json CUDA compile guards 2022-05-24 15:48:01 +01:00
7937ac2bab fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in pugixml/pugixml.cc 2022-05-24 15:31:03 +01:00
e909aeedf0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in Grid_Eigen_Dense.h 2022-05-24 15:29:42 +01:00
bab8aa8eb0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT
standard in DisableWarnings.h
2022-05-24 15:27:40 +01:00
Peter Boyle
38b22f05be Merge pull request #402 from fjosw/fix/clover_warnings
fixed clover warnings
2022-05-24 10:05:27 -04:00
3ca0de1c40 Fix json write for vector<string> 2022-05-24 14:37:33 +01:00
c7205d2a73 Removed nvcc guards for json 2022-05-24 14:30:26 +01:00
617c5362c1 fix: fixed warning: missing return statement at end of non-void function
in CloverHelpers
2022-05-24 11:37:33 +01:00
Peter Boyle
083b58e66d Merge pull request #401 from JPRichings/LocalCoheranceDeflation
Local coherance batch deflation
2022-05-20 11:44:22 -04:00
Peter Boyle
633427a2df Merge pull request #400 from JPRichings/wilson_sweep
bench wilson sweep fix
2022-05-20 11:43:40 -04:00
2031d6910a Merge branch 'paboyle:develop' into wilson_sweep 2022-05-20 16:20:23 +01:00
79e34b3eb4 Local Coherence batch deflation 2022-05-19 14:53:17 +01:00
4f3d581ab4 Merge branch 'paboyle:develop' into LocalCoheranceDeflation 2022-05-19 14:46:17 +01:00
Peter Boyle
d16427b837 Merge pull request #399 from fjosw/fix/Nc_neq_3
fix: assert for dimensions of compact Wilson clover moved to constructor
2022-05-17 09:03:42 -04:00
4b1997e2f3 wilson sweep test 2022-05-16 15:58:33 +01:00
8939d5dc73 bugfix: eo operator called in correct location 2022-05-16 00:28:28 +01:00
b051e00de0 Additional Local Coherance Deflation operator() 2022-05-16 00:25:13 +01:00
8aa75b492f Merge branch 'develop' into fix/Nc_neq_3 2022-05-10 14:22:03 +01:00
Peter Boyle
0274f40686 Merge pull request #389 from mbruno46/mbruno-eclover
Feature/expClover
2022-05-10 09:18:19 -04:00
Peter Boyle
77aa147ce5 Merge branch 'develop' into mbruno-eclover 2022-05-10 09:16:53 -04:00
32facbd02a fix: assert for dimensions of compact Wilson clover moved to
constructor.
2022-05-10 10:53:22 +01:00
Peter Boyle
4de50ab146 Merge pull request #396 from fjosw/fix/readd_config.h
fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am
2022-05-09 08:26:48 -04:00
8b12a61097 fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am 2022-05-09 11:53:22 +01:00
Peter Boyle
79ea027c0b Merge pull request #377 from RJHudspith/develop
NERSC and ILDG for non-SU(3) configuration checkpoints
2022-05-03 08:55:48 -04:00
Peter Boyle
62339d437f Merge pull request #387 from lehner/feature/gpt
Parity mass terms for domain wall fermions to enable 4d eofa
2022-05-03 08:52:18 -04:00
Peter Boyle
698e745276 Merge pull request #390 from fjosw/feature/conserved_current_wilson
Conserved current for wilson fermions
2022-05-03 08:51:10 -04:00
Peter Boyle
9a6e2c315d Merge pull request #394 from fjosw/fix/gauge_fix_ErrorOnNoConverge
SteepestDescentGaugeFix now exits when the algorithm does not converge.
2022-05-03 08:49:26 -04:00
e61fed87db SteepestDescentGaugeFix now exits when the algorithm does not converge.
This behaviour can be altered by setting err_on_no_converge to false.
2022-04-20 15:41:55 +01:00
b8bc560b51 Test_wilson_conserved_current implemented, all 5d references removed. 2022-04-05 17:33:45 +01:00
6bc2483d57 Merge branch 'feature/eclover' into feature/conserved_current_wilson 2022-04-05 15:26:49 +01:00
82aecbf4cf Test_wilson_conserved_current added 2022-04-05 15:26:39 +01:00
Mattia Bruno
ee23a76aa0 Merge pull request #2 from fjosw/feature/eclover
Feature/eclover
2022-04-05 13:30:13 +02:00
d7191e5a02 SeqConservedCurrent implemented for Wilson fermions 2022-04-05 11:48:56 +01:00
c8a824425b Error message added if another conserved current than vector is requested for
Wilson type fermions.
2022-04-05 10:58:22 +01:00
f23626a6b8 End scope by additional block in CloverHelpers.h 2022-04-02 16:08:15 +01:00
6577a03d16 Explcitly closed views in Exponentiate_Clover 2022-04-01 18:39:12 +01:00
427c8695fe Change signs and prefactors for conserved current to mimic the 5d
version.
2022-04-01 16:20:21 +01:00
9e82c468ab Multiplication of diagonal mass in exponentiate fixed for gpus 2022-04-01 15:54:43 +01:00
603fd96747 Missing link multiplication added. 2022-04-01 10:58:56 +01:00
fe993c0836 /=2 replaced by *=0.5 2022-03-31 17:08:17 +01:00
cdf31d52c1 GaugeGrid and typo fixed 2022-03-31 17:04:35 +01:00
0542eaf1da First version of conserved current contraction for Wilson type quarks 2022-03-31 17:02:09 +01:00
Christoph Lehner
317bdcf158 nerscio parametrization 2022-03-24 13:10:47 +01:00
Mattia Bruno
9ca2c98882 Merge branch 'develop' of https://github.com/paboyle/Grid into mbruno-eclover 2022-03-22 15:31:37 +01:00
Peter Boyle
605cf401e1 Merge branch 'feature/sumd-npr' into develop 2022-03-16 22:43:12 +00:00
Peter Boyle
f99c3660d2 Merge branch 'feature/cpu-threaded-smp' into develop 2022-03-16 22:07:54 +00:00
Peter Boyle
92a83a9eb3 Performance improve for Tesseract 2022-03-16 17:14:36 +00:00
Mattia Bruno
53ae01a34a Merge pull request #1 from fjosw/feature/eclover
Feature/eclover
2022-03-15 15:23:35 +01:00
Peter Boyle
b615fa0f35 Merge pull request #388 from fjosw/feature/sumd-npr
Feature/sumd npr
2022-03-15 09:05:57 -04:00
Christoph Lehner
76c294a7ba open bc fix 2022-03-08 13:55:16 +01:00
0c0c2b1e20 Unnecessary arguments of CloverHelpers::Exponentiate_Clover removed. 2022-03-08 09:44:51 +00:00
Christoph Lehner
e2fc3a0f04 Merge pull request #28 from paboyle/develop
Sync with Upstream
2022-03-08 09:58:51 +01:00
451e7972fd Reintroduced explicit inversion of the Clover term in case of the
CompactExpClover because of the open boundary O(a) improvement. Changed
the timing output to GridLogDebug
2022-03-07 17:43:33 +00:00
56c089d347 Removed leftover comments 2022-03-07 16:40:20 +00:00
acf740e44d Merge pull request #1 from FelixPGZiegler/feature/eclover
Feature/eclover
2022-03-07 16:25:11 +00:00
182f513404 Merge remote-tracking branch 'fjosw/feature/eclover' into feature/eclover 2022-03-07 15:22:04 +00:00
d5b2323a57 included Cayley-Hamilton exponentiation for the compact Wilson exp clover, bug fix for inverse of exp clover 2022-03-07 14:44:24 +00:00
FelixPGZiegler
bad18d4417 Merge branch 'paboyle:develop' into feature/eclover 2022-03-07 13:54:10 +00:00
d1decee4cc Cleaned up unused variables in Lattice_reduction_gpu.h 2022-03-02 16:54:23 +00:00
d4ae71b880 sum_gpu_large and sum_gpu templates added. 2022-03-02 15:40:18 +00:00
Peter Boyle
e16fc5b2e4 Threaded intranode comms transfer - ideally between NUMA domains 2022-03-01 11:17:24 -05:00
Peter Boyle
694306f202 Configure for mac arm 2022-03-01 10:53:44 -05:00
Peter Boyle
9aac1e6d64 Merge branch 'develop' into feature/sumd-npr 2022-03-01 10:51:38 -05:00
Peter Boyle
3e882f555d Large / small sumD options 2022-03-01 08:54:45 -05:00
438caab25f generate_instantiations.sh now correctly produces instantiations for CompactClover variant, redundant instantiations removed. 2022-02-27 18:27:18 +00:00
239e2c1ee6 tests: wilson clover cg tests now include compact variant as well as
exponential wilson clover operators
2022-02-27 18:26:34 +00:00
013dc2ef33 tests: core tests for wilson clover and wilson exp clover including
compact version extended/added
2022-02-27 18:13:47 +00:00
Christoph Lehner
9616811c3d Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2022-02-24 22:03:05 +01:00
Christoph Lehner
8a3002c03b separate left and right masses for CayleyFermion5D 2022-02-24 22:02:56 +01:00
Mattia Bruno
71034f828e attempt to fix broken WilsonExpClover; Compact version still broken will be replaced by F.Joswig 2022-02-23 01:02:27 +01:00
Mattia Bruno
11437930c5 cleaned up definitions of wilsonclover fermions 2022-02-22 10:45:16 +01:00
Mattia Bruno
3d44aa9cb9 cleaned up cloverhelpers; fixed test compact_clover which runs 2022-02-22 01:10:19 +01:00
Mattia Bruno
2851870d70 expClover support via helpers template class 2022-02-22 00:05:43 +01:00
Peter Boyle
63dbaeefaa Extra barrier prior to finalize just in case it fixes an issue on Tursa 2022-02-16 14:01:43 +00:00
Peter Boyle
e8c187b323 SyCL happier? 2022-02-15 11:24:38 -05:00
Peter Boyle
0c1618197f Faster intranode MPI works now 2022-02-15 08:52:07 -05:00
Peter Boyle
f49d5c2d22 Updated scripts for crusher 2022-02-14 17:55:16 -05:00
Peter Boyle
a3b022d469 Crusher compile 2022-02-14 15:09:08 -05:00
Peter Boyle
48772f0976 Merge pull request #384 from jdmaia/hip_launchbounds
Changing thread block order and adding launch_bounds
2022-02-14 11:08:28 -05:00
Peter Boyle
c322420580 Dont instantiate an Nc=3 and non-GP hardwired code for other implementations 2022-02-14 16:04:08 +00:00
Julio Maia
86f4e17928 Changing thread block order and adding launch_bounds 2022-02-07 11:29:37 -06:00
Peter Boyle
215df671be Merge pull request #382 from DanielRichtmann/feature/compact-clover
Compact Clover Fermions
2022-02-01 21:45:38 -05:00
Daniel Richtmann
1b6b12589f Get splitting up into implementation and instantiation files correct 2022-02-02 00:51:11 +01:00
Daniel Richtmann
3082ab8252 Check in compact version of wilson clover fermions 2022-02-02 00:50:05 +01:00
Daniel Richtmann
add86cd7f4 Abandon ET for clover application, use construct similar to multLink 2022-02-01 23:09:06 +01:00
Daniel Richtmann
0b6fd20c54 Enable memory coalescing in clover term generation 2022-02-01 23:09:06 +01:00
Daniel Richtmann
e83423fee6 Refactor clover to align with other files and prepare for upcoming changes 2022-02-01 23:09:06 +01:00
Daniel Richtmann
b4f8e87982 Have Grid's cli interface understand floats 2022-02-01 23:09:06 +01:00
Peter Boyle
135808dcfa Less verbose 2021-12-07 16:24:24 -05:00
Peter Boyle
7f7d06d963 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-12-07 09:06:42 -08:00
Peter Boyle
2bf3b4d576 Update to reduce memory footpring in benchmark test 2021-12-07 09:02:02 -08:00
RJHudspith
0bd83cdbda Fixes for Nc!=3 Nersc IO, Gauge and Gauge_NCxNC compatible with GLU. Trace normalisation changed in places removing explicit threes. Guards against non-su3 tests and tests failing when LIME is not compiled. 2021-11-28 21:51:03 +01:00
Peter Boyle
f34d34bd17 2 nodes 2021-11-22 22:27:16 -05:00
Peter Boyle
e32d5141b4 Updated to make MPI reliable still gives good perf, but MPI will be slow
intranode
2021-11-22 21:46:31 -05:00
Peter Boyle
6d5277f2d7 Update to Spock 2021-11-22 20:58:02 -05:00
Peter Boyle
14d82777e0 Best modules for spock 2021-11-22 20:47:16 -05:00
Peter Boyle
2a4e739513 Enable XGMI copy (need to rename nvlink to cover NVLINK/XGMI/XeLink) 2021-11-22 20:46:09 -05:00
Peter Boyle
8079dc2a14 Cray MPI not working right yet 2021-11-22 20:45:44 -05:00
Peter Boyle
6ceb556684 Intranode asynch hipMemCopy 2021-11-22 20:45:12 -05:00
Peter Boyle
76cde73705 HIP improvements on messaging and intranode hipMemCopyAsynch 2021-11-22 20:44:39 -05:00
Peter Boyle
cc094366a9 Merge pull request #375 from JPRichings/develop
Lattice object ACCcache probe
2021-11-09 18:19:32 -05:00
41a575ff9b Format edit 2021-11-09 21:56:23 +00:00
12ef413065 fix to deflation.h 2021-11-09 21:20:36 +00:00
829a328451 remove deflation timing 2021-11-09 20:46:57 +00:00
402523c62e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-11-09 12:57:40 +00:00
d7bef70b5c Helper functions to allow probe of cache state of lattice objects. 2021-11-09 12:57:09 +00:00
2ad1811642 Added timing to deflation code. 2021-11-09 12:33:25 +00:00
a65a497bae Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-10-29 13:01:34 +01:00
b27b12828e reverse previous "fix", missing statement was probably intentional, added a comment to that effect 2021-10-29 13:01:31 +01:00
Peter Boyle
42d56ea6b6 Verbosity 2021-10-29 02:23:08 +01:00
Peter Boyle
0b905a72dd Better reduction for GPUs 2021-10-29 02:22:22 +01:00
Peter Boyle
fe9edf8526 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-29 02:03:27 +01:00
Peter Boyle
44204c7e06 Extra code 2021-10-29 02:02:56 +01:00
Peter Boyle
33b3789598 Merge pull request #364 from AndrewYongZhenNing/develop
CayleyFermion5D Conserved current fix
2021-10-27 20:27:20 -04:00
Peter Boyle
195ab2888d Merge branch 'develop' into develop 2021-10-27 20:26:57 -04:00
Peter Boyle
85f750d753 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-27 00:28:05 +01:00
Peter Boyle
a4ce6e42c7 Warning free compile on make all and make tests under nvcc 2021-10-27 00:27:03 +01:00
Peter Boyle
5398b7e7e3 Max 128 size 2021-10-26 09:16:29 -07:00
fd13a3f2be Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-26 10:45:46 +01:00
c144b32368 deflation timers 2021-10-26 10:37:24 +01:00
Peter Boyle
ba7e371b90 Warning free compile on Tursa.
Hopefully got all reqd virtual dtors
2021-10-21 19:56:52 +01:00
Peter Boyle
99e7a5d18a Merge pull request #371 from edbennett/hmc-documentation-update
update documentation for GenericHMCRunner - thanks
2021-10-18 14:36:43 -04:00
f824d99059 update documentation for GenericHMCRunner 2021-10-18 09:50:16 +01:00
Peter Boyle
749b8022a4 Linear operator and SparseMatrix virtual destructors 2021-10-15 20:47:18 +01:00
Peter Boyle
7e0057d2c4 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-15 20:46:51 +01:00
Peter Boyle
cfe9e870d3 Stream 2021-10-15 20:46:44 +01:00
Peter Boyle
e9c4f06cbf Merge pull request #370 from fjosw/bugfix/gpu_sum_shm
Error Handling sum_Dgpu large objects
2021-10-14 09:12:47 -04:00
1f9688417a Error message added when attempting to sum object which is too large for
the shared memory
2021-10-13 20:45:46 +01:00
Peter Boyle
16c2a99965 Overlap cudamemcpy - didn't set up stream right 2021-10-11 13:31:26 -07:00
Peter Boyle
cda915a345 Better options 2021-10-07 20:29:09 +01:00
Peter Boyle
7c16189e16 Merge pull request #368 from Heinrich-BR/develop
Accelerated Pick-Set Checkerboard functions
2021-10-07 15:13:09 -04:00
Peter Boyle
ecbfccea43 Merge pull request #369 from paboyle/gauge-group-covariance
expose gauge group in GImpl and generic Nc fix
2021-10-07 15:11:12 -04:00
Peter Boyle
a8eda8f6da Summit scripts 2021-10-05 21:22:10 -04:00
Peter Boyle
9b1a0653cf Summit results 2021-10-05 21:22:01 -04:00
Peter Boyle
7cb1ff7395 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-05 20:13:42 -04:00
Peter Boyle
ab6ea29913 Print removal 2021-10-05 20:13:25 -04:00
b5c81a02b6 Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-10-05 21:13:01 +01:00
d899ee80fc skip record fixed to include norm metadata 2021-10-05 21:12:47 +01:00
Peter Boyle
4016e705fc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-05 14:56:57 -04:00
Peter Boyle
2f4e85e5d6 Summit set up 2021-10-05 14:56:17 -04:00
Peter Boyle
8ed0b57b09 Memory verbose and tracking, shrink default cache
Print PCI device IDs on node 0
2021-10-05 11:41:03 -04:00
a976fa6746 expose gauge group in GImpl and generic Nc fix 2021-10-05 14:19:47 +01:00
6c66b8d997 deflated guesser can optionally be used with less vectors than provided 2021-09-30 19:25:12 +01:00
9523ad3d73 vector version of Schur solver use vector guesser 2021-09-28 12:45:47 +01:00
73a95fa96f LinearFunction loops over vectors by default, can be overloaded 2021-09-28 12:44:26 +01:00
7e130076d6 Fixed line left behind 2021-09-24 17:26:31 +01:00
6efdad6f21 Removed Halo benchmark 2021-09-24 17:18:04 +01:00
a822c48565 Added accelerated pick-set checkerboard functions 2021-09-24 17:13:25 +01:00
014fb76e88 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-09-24 16:45:25 +01:00
30e5311b43 Update from the gods upstream 2021-09-24 16:39:56 +01:00
Peter Boyle
67e08aa952 New file not run yet 2021-09-23 23:39:55 +02:00
Peter Boyle
ed1f20f3a1 Merge pull request #367 from mmphys/bugfix/H5NS
Hdf5 namespace
2021-09-23 12:36:11 -04:00
Peter Boyle
cffc736bb3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-22 06:03:06 -07:00
Peter Boyle
c0d56a1c04 Perlmutter tune up 2021-09-22 06:02:34 -07:00
Peter Boyle
3206f69478 SYCL happy 2021-09-21 18:01:35 -07:00
Peter Boyle
b2ccaad761 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 12:18:05 -07:00
Peter Boyle
8eb1232683 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 09:25:07 -07:00
Peter Boyle
c6ce3ad03b Some properties 2021-09-21 09:20:21 -07:00
Peter Boyle
b3b033d343 Clean 2021-09-21 09:18:54 -07:00
Peter Boyle
ca9816bfbb Typo 2021-09-21 04:12:04 +02:00
Peter Boyle
814d5abc7e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 04:05:51 +02:00
Peter Boyle
a29122e2bf Rebench 2021-09-21 04:05:04 +02:00
Peter Boyle
e188c0512e Udpdate 2021-09-21 01:04:30 +02:00
Peter Boyle
1fb6aaf150 Device 2 Device with cudaMemcpy 2021-09-21 01:03:07 +02:00
Peter Boyle
894654f7ef Simplificatoin, always gather faces 2021-09-21 01:02:34 +02:00
Peter Boyle
109507888b Option to force use of MPI over Nvlink 2021-09-21 00:53:25 +02:00
Peter Boyle
68650b61fe Options controlling behaviour 2021-09-21 00:51:01 +02:00
Michael Marshall
7ee66bf453 Make sure H5NS has empty definition if HDF5 built without C++ namespace. Add comment in Hdf5IO.cc indicating likely source of error using H5NS, i.e. lack of --enable-cxx in hdf5 configure. 2021-09-19 19:45:20 +01:00
Peter Boyle
8bd70ad8b5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-16 10:22:38 -07:00
Peter Boyle
af98525766 Merge pull request #359 from paboyle/feature/serialisation-update
Feature/serialisation update
2021-09-16 10:24:52 -04:00
Peter Boyle
1c2f218519 Merge pull request #360 from pjgeorg/ld-nvcc-openmp
nvcc: Add -fopenmp to LDFLAGS
2021-09-16 10:24:30 -04:00
Peter Boyle
c9aa1f507c Merge pull request #363 from felixerben/feature/testMesonField
Feature/test meson field
2021-09-16 10:23:58 -04:00
Peter Boyle
ea7126496d Merge pull request #361 from edbennett/fix-setdevice-message
make message about setdevice consistent with configure script
2021-09-16 10:23:37 -04:00
Peter Boyle
f660dc67e4 Merge pull request #366 from lehner/feature/gpt
Avx512 mixed prec
2021-09-15 20:27:13 -04:00
Christoph Lehner
ede8faea74 Merge branch 'paboyle:develop' into feature/gpt 2021-09-16 02:23:15 +02:00
Christoph Lehner
1b750761c2 Merge pull request #26 from waterret/feature/gpt
AVX512 drop mixed precision as well
2021-09-16 02:22:52 +02:00
Peter Boyle
145acf2919 Perf results 2021-09-16 01:06:28 +01:00
Peter Boyle
cc4a27b9e6 Scripts and performance 2021-09-16 00:15:35 +01:00
Peter Boyle
b4690e6091 Adding build basics for different systems 2021-09-16 00:00:38 +01:00
Luchang Jin
4b24800132 AVX512 drop mixed precision as well 2021-09-15 16:29:47 -04:00
Peter Boyle
9d2238148c Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-09-15 19:25:57 +01:00
Peter Boyle
c15493218d Two extra routines to break out SchurRedBlack on many RHS into stages to allow efficient deflation & split grid
Split grid solver still to do.
2021-09-15 19:24:39 +01:00
Peter Boyle
001a556a34 Merge pull request #365 from lehner/feature/gpt
Sync
2021-09-15 13:34:02 -04:00
Christoph Lehner
3d0f88e702 A64FX drop mixed precision as well 2021-09-15 18:38:32 +02:00
Christoph Lehner
dd091d0960 consistent pointer offloading instead of views 2021-09-15 16:58:05 +02:00
Christoph Lehner
e2abbf9520 Merge pull request #25 from paboyle/develop
Sync
2021-09-15 10:02:43 +02:00
Peter Boyle
c7baeb5bae Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-14 08:31:11 -07:00
Peter Boyle
402d80e197 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-09-14 16:16:06 +01:00
Peter Boyle
86e33c8ab2 Significant GPU perf speed up finished 2021-09-14 16:14:23 +01:00
Peter Boyle
5dae6a6dac Deprecate half prec comms 2021-09-14 15:06:59 +01:00
Peter Boyle
361bb8a101 Remove half prec comms 2021-09-14 15:06:29 +01:00
Peter Boyle
7efdb3cd2b Remove half prec comms 2021-09-14 15:06:06 +01:00
Peter Boyle
65ef4ec29f Move tables to device memory 2021-09-14 15:05:01 +01:00
Peter Boyle
d5835c0222 Switch to coalesced stencil face gather 2021-09-14 15:04:14 +01:00
Peter Boyle
a7b943b33e Remove half prec comms 2021-09-14 05:05:33 +01:00
Peter Boyle
7440cde92f No half prec comms; coalesced access on GPU 2021-09-14 05:04:56 +01:00
Peter Boyle
0fc662bb24 Dirac cuda 11.4 happy ; force host for functions accessing mult table
ET runs these on host BEFORE lodging result in AST for kernel
2021-09-14 05:00:44 +01:00
Peter Boyle
8195890640 Force MPI over NVLINK 2021-09-14 05:00:17 +01:00
Peter Boyle
4c88104a73 Fix compile warns 2021-09-11 23:08:05 +01:00
Peter Boyle
73b944c152 Drop half prec comms for now. 2021-09-11 23:07:18 +01:00
Peter Boyle
d1b0b7f5c6 Half prec comms dropping 2021-09-11 23:05:40 +01:00
Peter Boyle
381d8797d0 Drop half prec comms for now 2021-09-11 23:05:02 +01:00
11ee8a1061 Merge remote-tracking branch 'upstream/develop' into develop 2021-09-02 16:57:42 +01:00
Peter Boyle
b06526bc1e Comment update 2021-08-30 21:15:39 -04:00
Peter Boyle
3044419111 Some sample code 2021-08-30 20:32:11 -04:00
Peter Boyle
bcfa9cf068 Improvement of output 2021-08-28 08:08:15 -07:00
Peter Boyle
114920b8de Some example clean up 2021-08-25 12:24:17 +01:00
Peter Boyle
0d588b95f4 Bug fix to Example_Laplacian test 2021-08-23 23:14:26 +01:00
Peter Boyle
5b3c530aa7 Return value 2021-08-23 15:30:45 +01:00
Peter Boyle
c6a5499c8b Fail on non-apple 2021-08-22 18:40:55 +01:00
Peter Boyle
ec9c3fe77a Remove the file 2021-08-22 18:28:39 +01:00
Peter Boyle
6135ad530e Extra examples / solutions 2021-08-22 18:25:07 +01:00
Peter Boyle
40098424c7 Examples 2021-08-22 14:17:12 +01:00
Peter Boyle
7163b31a26 Examples 2021-08-20 01:15:23 +01:00
Peter Boyle
ffbdd91e0e Apple happiness 2021-08-20 01:15:00 +01:00
Peter Boyle
5d29e175d8 Typo fix 2021-08-10 18:25:43 +01:00
Peter Boyle
417dbfa257 Fix 2021-08-10 08:55:35 -07:00
peterx.a.boyle
1eda4d8e0b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-08-10 05:41:18 -07:00
peterx.a.boyle
50181f16e5 Level 0 IPC set up 2021-08-10 05:35:15 -07:00
Peter Boyle
75030637cc Improved comms benchmark, same as benchmark_comms_host_device 2021-08-10 05:16:30 -07:00
Peter Boyle
fe5aaf7677 Make comms benchmark same as Benchmark_comms_host_device 2021-08-09 04:06:30 -07:00
Peter Boyle
80ac2a73ca Check is wrong (HtoD / DtoH) 2021-08-05 18:33:20 -04:00
Andrew Yong
770680669d Whitespace removal. 2021-08-04 09:21:59 +01:00
Andrew Yong
0cdfc5cf22 Merge remote-tracking branch 'upstream/develop' into develop 2021-07-30 14:40:55 +01:00
d75a66a3e6 test done 2021-07-06 11:42:36 +01:00
fcc4374d7b i/o done 2021-07-05 14:52:00 +01:00
67c3c16fe5 working test 2021-07-05 14:41:52 +01:00
25e9be50b5 created test file 2021-07-02 15:51:19 +01:00
428b8ba907 Updated from upstream and added halo benchmark 2021-06-29 01:05:12 +01:00
323cf6c038 make message consistent with configure script 2021-06-23 17:00:43 +01:00
Peter Boyle
29a22ae603 Simpler SYCL setup 2021-06-22 17:57:20 +00:00
Peter Boyle
403bff1a47 Force reqd subgroup size fo SYCL 2021-06-22 17:56:10 +00:00
Christoph Lehner
c50f27e68b Make FFT play nice with split grid 2021-06-20 11:34:38 +02:00
Peter Georg
80afacec5b nvcc: Add -fopenmp to LDFLAGS 2021-06-17 13:05:13 +02:00
Peter Boyle
6cd9224dd7 SYCL comms buffer allocate 2021-06-16 17:10:55 +00:00
Peter Boyle
4bf8196ff1 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-06-15 21:45:36 +00:00
Peter Boyle
4c5440fb06 const happy for sycl 2021-06-15 21:45:07 +00:00
a269a3d919 Merge pull request #358 from mmphys/feature/serialisation-test
Add a ragged std::vector to the serialisation test
2021-06-09 10:16:25 +01:00
Michael Marshall
0c4f585496 Test nested std::vector<grid tensor> 2021-06-08 00:05:35 +01:00
Michael Marshall
33d2df46a0 Merge branch 'develop' into feature/serialisation-test
* develop:
  Update README.md
  removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore)
2021-06-07 23:25:38 +01:00
Michael Marshall
2df308f649 Add a ragged vector to the serialisation tests. NB: Already had nested (regular) std::vector<std::vector<...>> 2021-06-07 23:25:07 +01:00
Peter Boyle
92def28bd3 Update README.md 2021-06-06 04:52:05 -04:00
ca10bfa1c7 removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore) 2021-06-04 11:12:22 +01:00
298a6ec51e Merge pull request #357 from mmphys/bugfix/ragged
Bugfix/ragged Multi-dimensional ragged vectors
2021-06-04 10:34:46 +01:00
Michael Marshall
e5dbe488a6 Merge branch 'develop' into bugfix/ragged
* develop:
  Remove synch
2021-06-03 08:25:56 +01:00
Peter Boyle
0e27e3847d Remove synch 2021-06-03 04:24:19 +00:00
Michael Marshall
393727b93b Documentation update (briefly) covering serialisation changes. For review 2021-06-01 15:49:37 +01:00
Michael Marshall
2b1fcd78c3 Fixes post review with Peter: a) Correct bug in isRegularShape - detect 3d matrix where 1st slice is 2x2 and second slice is 2x1; b) Synchronisation of EigenResizeCounter done by checking we're the OMP primary thread; c) Move definition of EigenResizeCounter to new file, BaseIO.cc 2021-05-31 22:24:54 +01:00
Michael Marshall
0a4e0b49a0 BaseIO: Added "EigenResizeCounter" to keep track of any allocations/deallocations to Eigen tensors during readback. On read, if the tensor is resized, EigenResizeCounter += delta memory (in bytes) 2021-05-31 12:49:56 +01:00
Michael Marshall
76af169f05 Add global namespace to Writer<T> and Reader<T> inside GRID_SERIALIZABLE_CLASS_MEMBERS (so that "using Grid" not necessary).
Fix issue with output of Grid::iMatrix so that M<3>{{148,149,150,} {151,152,153,} {154155156}} becomes M<3>{{148,149,150} {151,152,153} {154,155,156}}
2021-05-31 08:43:02 +01:00
Michael Marshall
7b89232251 Extended HDF5 serialisation of std::vector<T> where T now also includes Grid scalar/vector/matrix
Changed VectorUtils element traits to is_flattenable, because: a) contract changed on what it does; and b) no other Grid dependencies on element. Needs review.
Initial tests work ... needs proper regression testing.
2021-05-30 20:27:53 +01:00
Peter Boyle
b5aeae526f Make Cshift fields static to avoid repeated reallocaate overhead 2021-05-28 16:33:08 +02:00
Michael Marshall
ef0ddd5d04 std::vector serialisation in hdf5 uses a different format if the vector is ragged. When reading back std::vector we need to check which format we're reading (since we don't know a priori) and this involves looking for attributes that may not exist. The c++ API: a) throws; and b) prints voluminous logging. Switched to non-throwing, non-logging, C version of the API after code review. 2021-05-24 18:43:55 +01:00
Michael Marshall
9b73dacf50 First row might still be ragged if multi dimensional. attrExists() doesn't throw, but easier to wrap in try ... catch than to explain in comment. 2021-05-22 04:34:32 +01:00
Michael Marshall
244b4aa07f Serialise std::vector of numeric types as multidimensional object if size is regular ... or individually if ragged 2021-05-21 20:08:56 +01:00
u61464
8cfc7342cd staggered hand unroll read coalesce 2021-05-05 14:17:18 -07:00
u61464
15ae317858 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-05-04 08:40:38 -07:00
u61464
834f536b5f Fastest option on SyCL is now std::complex 2021-05-04 08:40:18 -07:00
Peter Boyle
c332d9f08b Merge pull request #356 from felixerben/bugfix/stoutSmearing
Jamie's fix
2021-04-27 14:10:49 -04:00
cf2923d5dd Jamie's fix 2021-04-27 16:53:37 +01:00
Peter Boyle
0e4413ddde Merge pull request #355 from felixerben/bugfix/stoutSmearing
bugfix 3D stout smearing
2021-04-27 08:01:55 -04:00
009ccd581e bugfix 3D stout smearing 2021-04-26 10:36:33 +01:00
Peter Boyle
8cd4263974 Tests compile 2021-04-25 22:20:37 -04:00
Peter Boyle
d45c868656 Change interface 2021-04-25 10:53:34 -04:00
Peter Boyle
955a8113de Expose label only to reduce number of parameters 2021-04-25 10:36:38 -04:00
Peter Boyle
dbe210dd53 Open the ens_id 2021-04-25 10:25:59 -04:00
54c6b1376d Quick fix of conserved current implementation in CayleyFermion5D. Now function treats current insertion with appropriate periodic boundary conditions in the mu=3 direction. 2021-04-21 16:56:46 +01:00
Peter Boyle
86e11743ca set twists 2021-04-20 10:19:11 -04:00
f3f11b586f Tadpole sign now in front of forward hopping term to be consistent with previous implementation and analytic form. 2021-04-17 12:44:27 +01:00
8083e3f7e8 Sign factor for tadpole implementation corrected. 2021-04-15 11:14:31 +01:00
Peter Boyle
980e721f6e Update MetaData.h 2021-04-13 09:33:01 -04:00
364793154b Reverted checkerboard changes 2021-04-09 15:47:17 +01:00
3e2ae1e9af Added profiling messages to pick and set checkerboard functions 2021-04-08 16:58:47 +01:00
Henrique Rocha
d38ae2fd18 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-04-06 17:18:39 +01:00
Henrique Rocha
030e7754e4 Merge remote-tracking branch 'upstream/develop' into develop 2021-04-06 17:16:13 +01:00
Peter Boyle
e2a0142d87 Merge pull request #348 from AndrewYongZhenNing/develop
Conserved Tadpole Implementation for Shamir Action Only
2021-04-06 10:49:00 -04:00
895244ecc3 Merge with upstream; implemented conserved tadpole for Shamir action. 2021-04-06 13:46:33 +01:00
addeb621a7 Implemented tadpole operator for Shamir action. 2021-04-06 13:45:37 +01:00
3b7fce1e76 Reverted checkerboard changes 2021-04-02 14:38:41 +01:00
4d15417f93 Merge remote-tracking branch 'upstream/develop' into develop 2021-04-01 18:28:15 +01:00
ab3c855f65 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-04-01 18:22:05 +01:00
92e2c517d8 Changed pick- and setCheckerboard to use accelerator_for 2021-04-01 18:21:19 +01:00
Peter Boyle
a7fb25adf6 Make Cshift fields static to avoid repeated reallocaate overhead 2021-03-29 21:44:14 +02:00
Peter Boyle
e947992957 Improved force terms 2021-03-29 20:04:06 +02:00
Peter Boyle
bb89a82a07 Staggered coalseced read 2021-03-29 20:01:15 +02:00
Christoph Lehner
2bb374daea hip-friendly 2021-03-19 11:33:23 +01:00
Peter Boyle
8bdadbadac Cold start 2021-03-18 15:41:14 -04:00
Peter Boyle
15c50a7442 Explicit instantiate the template function 2021-03-18 15:40:42 -04:00
Peter Boyle
49b0af2c95 Update of tests to compile with the sRNG addition.
Audited the code conventions (again) with the CPS momentum denominator
and added anti periodic in time to the Test_mobius_force.cc and
tested the Test_dwf_gpforce.

Promoted thesee to test full HMC hamiltonian, tr P^2/2 + phidag MdagM phi

with the same pdot and Udot as audited in the Integrator.h etc...

With full comments and sources for factors.
2021-03-18 09:10:02 -04:00
Peter Boyle
9c2b37218a sRNG parameter added 2021-03-18 06:24:11 -04:00
Peter Boyle
3c67d626ba Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-12 15:36:55 +01:00
Peter Boyle
51f506553c Read out the local ID once, and store 2021-03-12 15:33:04 +01:00
Peter Boyle
226be84937 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-12 09:31:50 -05:00
Peter Boyle
001814b442 updated to do list. Start adding DDHMC work items 2021-03-12 09:31:17 -05:00
Peter Boyle
db3ac67506 Update thread issue 2021-03-12 14:55:07 +01:00
Peter Boyle
da91a884ef NVCC versions found buggy added as guard 2021-03-11 23:54:53 +01:00
Peter Boyle
a71e6755e3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-11 22:43:06 +01:00
Peter Boyle
cd5891eecd Test that fails on Cuda 11.0 2021-03-11 22:34:28 +01:00
Peter Boyle
5bb7336f27 Merge pull request #347 from pjgeorg/fix-autotools-avx512
Fix inconsistent SIMD option AVX512

Thanks
2021-03-11 16:29:07 -05:00
Peter Boyle
ce1fc1f48a Possible fallback plan for Fionn's compiler bbug in nvcc 2021-03-11 22:20:53 +01:00
Peter Georg
82402c6a7c Add simd option SKL for ICC 2021-03-11 13:08:40 +01:00
Peter Georg
d9c4afe5b7 Fix inconsistent configure option AVX512
Before this change AVX512 enabled different instruction sets depending
on the compiler:

For Intel C++ Compiler Classic (ICC):
    AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL
    i.e. Intel Xeon Skylake and newer

For Intel ICX, gcc, clang:
    AVX512F, AVX512CD, AVX512ER, AVX512PF
    i.e. Intel Xeon Phi x200/x205 (KNL/KNM)

With this commit AVX512 now only enables the common instruction sets
supported by all CPUs supporting any AVX-512 instructions set:
AVX512F and AVX512CD (called COMMON-AVX512 by icc)
2021-03-11 12:58:49 +01:00
Peter Boyle
f786ff8d69 Extend test from Fionn, fails on A100 apparently 2021-03-10 14:32:06 -05:00
u61464
a651caed5f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-10 06:23:51 -08:00
u61464
0e21adb3f6 Gives 200GF/s on SyCL/DG1 8^4, doesn't uglify develop for other platforms too badly.
Easy to revert to clean more C++ stylistic code. Theres a SYCL_HACK macro I will clean up later once dpcpp
evolves a central nervous systems.
2021-03-10 05:40:51 -08:00
Peter Boyle
58bf9b9e6d Clean up test 2021-03-10 02:45:22 +01:00
Peter Boyle
2146eebb65 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-09 04:31:46 +01:00
Peter Boyle
6a429ee6d3 2d loop hits Nvidia 16bit limit on large local vols 2021-03-09 04:31:10 +01:00
Peter Boyle
4d1ea15c79 More verbosity. The 16bit limit on Grid.y, Grid.z is annoying 2021-03-09 04:29:37 +01:00
Peter Boyle
a76cb005e0 Update Tensor_exp.h 2021-03-08 13:37:57 -05:00
Christoph Lehner
49ecbc81d4 Merge pull request #24 from ThomasWurm/feature/gpt
Put GlobalSum outside the slice loop in sliceSum
2021-03-08 16:01:47 +01:00
Thomas Wurm
9e5fb52eb9 Put GlobalSum outside the slice loop 2021-03-08 13:53:34 +01:00
Peter Boyle
a9604367c1 Merge pull request #336 from lehner/feature/gpt
Make ShmDims configurable; adjust GRID_MAX_SIMD to allow for 128 byte width on GPUs
2021-03-05 13:17:19 -05:00
Peter Boyle
d7065023cc Merge pull request #332 from mmphys/feature/mres_schur
Optional changes to Test_cayley_mres e.g. Schur solver
2021-03-05 12:47:07 -05:00
Peter Boyle
89d299ceec Merge pull request #333 from mmphys/bugfix/LatTransfer
Fix convertType for GPU in Lattice_transfer.h
2021-03-05 12:46:33 -05:00
Peter Boyle
e34eda66df Merge pull request #344 from felixerben/feature/XiToSigma
Feature/xi to sigma
2021-03-05 12:45:44 -05:00
Christoph Lehner
b24181aa4f Update Coordinate.h
Revert GRID_MAX_SIMD change
2021-03-05 16:56:58 +01:00
Peter Boyle
aa173e2998 Update README.md 2021-03-05 10:25:33 -05:00
7a19432e0b whitespace 2021-03-05 10:57:09 +00:00
9b15704290 tested and consitent 2021-03-05 10:42:32 +00:00
Michael Marshall
017f955b2d Merge branch 'develop' into feature/mres_schur
* develop:
  Pass serial RNG around
  Sycl happier
2021-03-04 20:42:02 +00:00
Michael Marshall
f252d69eef Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Pass serial RNG around
  Sycl happier
2021-03-04 20:41:30 +00:00
3b06e4655e Merge branch 'develop' into feature/XiToSigma 2021-03-04 20:06:16 +00:00
d4b4de8f42 changes 2021-03-04 20:01:24 +00:00
Peter Boyle
c90beee774 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-03 23:50:29 +01:00
Peter Boyle
1eea9d73b9 Pass serial RNG around 2021-03-03 23:50:01 +01:00
u61464
679d1d22f7 Sycl happier 2021-03-03 11:21:43 -08:00
Michael Marshall
b2b5e0b98c Merge branch 'develop' into feature/mres_schur
* develop:
  Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.
  Better SIMD usage/coalescence
2021-03-03 16:15:12 +00:00
Michael Marshall
03e54722c1 Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.
2021-03-03 16:13:23 +00:00
Peter Boyle
442336bd96 Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case.
Other cases to do. This now includes comms code path.
2021-03-02 14:50:51 +01:00
Christoph Lehner
9c9566b9c9 Merge pull request #23 from paboyle/develop
Sync
2021-03-01 12:33:51 +01:00
Michael Marshall
1059a81a3c Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Better SIMD usage/coalescence
2021-02-27 00:21:36 +00:00
Peter Boyle
2e61556389 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-02-26 17:52:20 +01:00
Peter Boyle
f9b1f240f6 Better SIMD usage/coalescence 2021-02-26 17:51:41 +01:00
Michael Marshall
69f41469dd Merge branch 'develop' into bugfix/LatTransfer
* develop: (26 commits)
  Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
  Correct misleading ac help string
  Enable performance counting in WilsonFermion like in others
  changed back A2AUtils warning
  changed if and accelerator_for - no runtime errors any more
  Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
  Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
  Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field
  MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error
  maxLocalNorm2()
  change back benchmark_ITT
  prettify
  Flop cout matches DiRAC-ITT-2020
  revert changes
  merge develop
  fixes
  weird bug in 2pt function...
  revert changes
  final version, tested on CPU and GPU
  bugfix
  ...
2021-02-25 09:19:17 +00:00
Michael Marshall
d620b303ff Merge branch 'develop' into feature/mres_schur
* develop: (26 commits)
  Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
  Correct misleading ac help string
  Enable performance counting in WilsonFermion like in others
  changed back A2AUtils warning
  changed if and accelerator_for - no runtime errors any more
  Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
  Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
  Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field
  MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error
  maxLocalNorm2()
  change back benchmark_ITT
  prettify
  Flop cout matches DiRAC-ITT-2020
  revert changes
  merge develop
  fixes
  weird bug in 2pt function...
  revert changes
  final version, tested on CPU and GPU
  bugfix
  ...
2021-02-24 18:07:27 +00:00
Peter Boyle
157fd1428d Merge pull request #342 from paboyle/feature/link-update-mask
Feature/link update mask
2021-02-24 11:29:52 -05:00
Christopher Kelly
c791cb2214 Merge branch 'develop' into feature/link-update-mask 2021-02-23 11:51:54 -05:00
Christopher Kelly
d5ab571a89 Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces
Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC
Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
2021-02-23 11:49:56 -05:00
0ed800f6e4 merge develop 2021-02-23 14:54:46 +00:00
Peter Boyle
0a32183825 Merge pull request #335 from felixerben/gpu/baryons
Gpu/baryons
2021-02-23 09:30:16 -05:00
Peter Boyle
2cacfbde2a Merge pull request #341 from DanielRichtmann/fix/minor-things
Minor fixes
2021-02-22 09:28:50 -05:00
Daniel Richtmann
c073e62e0b Correct misleading ac help string 2021-02-22 15:25:44 +01:00
Daniel Richtmann
e3d019bc2f Enable performance counting in WilsonFermion like in others 2021-02-22 15:25:40 +01:00
7ae030f585 changed back A2AUtils warning 2021-02-18 13:24:50 +00:00
86b58d5aff changed if and accelerator_for - no runtime errors any more 2021-02-18 12:04:32 +00:00
Peter Boyle
26e8b9f4a5 Merge pull request #340 from mmphys/bugfix/config
Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
2021-02-17 11:56:21 -05:00
Michael Marshall
35114c9e62 Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu 2021-02-17 13:24:15 +00:00
Peter Boyle
dfd28a85c9 Merge pull request #339 from mmphys/bugfix/config
Optional rename PACKAGE_ to GRID_ in Grid/Config.h
2021-02-15 13:53:26 -05:00
Michael Marshall
a503332924 Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working.
Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
2021-02-14 21:27:54 +00:00
Peter Boyle
1ac13ec3a7 Merge pull request #338 from paboyle/bugfix/maxnorm2
Fixed compile issues with maxLocalNorm2 for non-scalar lattices
2021-02-08 12:08:11 -05:00
Christopher Kelly
55de69a569 Fixed compile issues with maxLocalNorm2 for non-scalar lattices
maxLocalNorm2 test now reuses the random field
2021-02-08 12:03:16 -05:00
Peter Boyle
eda9ab487b MADWF 5d source option for hadrons - look at Grid of source
Abort on GPU error
2021-02-08 10:47:22 -05:00
Peter Boyle
cd99edcc5f maxLocalNorm2() 2021-02-04 18:25:49 -05:00
Christoph Lehner
4705aa541d Allow user to configure ShmDims via environment variables 2021-02-04 14:25:55 +01:00
Michael Marshall
3215d88a91 Simplify syntax with Grid::EnableIf post code review. Updated EnableIf so that ReturnType defaults to void in same way as std::enable_if see https://en.cppreference.com/w/cpp/types/enable_if 2021-02-03 15:17:03 +00:00
9b9a53f870 ... 2021-02-02 13:06:43 +00:00
Christoph Lehner
019ffe17d4 Allow for GPU vector width beyond 64 2021-02-02 11:32:23 +01:00
bc496dd844 change back benchmark_ITT 2021-01-28 14:29:56 +00:00
a673b6a54d prettify 2021-01-28 14:15:09 +00:00
1bf2e4d187 Merge branch 'develop' into gpu/baryons 2021-01-27 21:17:37 +00:00
Peter Boyle
96dd7a8fbd Flop cout matches DiRAC-ITT-2020 2021-01-27 21:14:52 +00:00
7905afa9f5 revert changes 2021-01-27 21:14:52 +00:00
712bb40650 merge develop 2021-01-27 21:14:52 +00:00
81d88d9f4d fixes 2021-01-27 21:09:51 +00:00
Michael Marshall
77063418da Fix issue for GPU by ensuring accelerator_inline version of convertType is available for Grid::complex<T>. This removes many warnings in Hadrons
Simplify the SFINAE syntax and correct convertType for iScalar
2021-01-25 15:09:36 +00:00
Michael Marshall
2983b6fdf6 Optional (superficial) changes to make comparison with Hadrons WardIdentity module easier: use Schur solver; example of Hadrons random gauge init; logging updates; only solve reverse propagator if provided 2021-01-23 12:41:48 +00:00
Peter Boyle
69f1f04f74 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-01-21 21:39:59 -05:00
Peter Boyle
11a5fd09d6 Hot config 2021-01-21 21:39:41 -05:00
Peter Boyle
ff1fa98808 Fix for GPU conserveed current 2021-01-21 21:38:23 -05:00
df16202865 weird bug in 2pt function... 2021-01-19 19:25:27 +00:00
3ff7c2c02a Merge branch 'develop' into gpu/baryons 2021-01-19 12:34:13 +00:00
fc6d07897f revert changes 2021-01-19 12:32:48 +00:00
f9c8e5c8ef Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-01-19 12:30:29 +00:00
8bfa0e74f8 final version, tested on CPU and GPU 2021-01-19 12:27:57 +00:00
9b73a937e7 bugfix 2021-01-18 18:57:05 +00:00
Peter Boyle
b0339bc5a4 Merge branch 'feature/conjugate-bc-dirs' into develop 2021-01-15 09:28:39 -05:00
Peter Boyle
3c23a947cc Fixed test for very much non-unit det 2021-01-15 09:16:02 -05:00
Peter Boyle
56111bb823 Merge branch 'develop' into feature/conjugate-bc-dirs 2021-01-14 21:01:22 -05:00
Peter Boyle
99445673f6 Gparity fix, and plaquette IO 2021-01-14 21:00:36 -05:00
Peter Boyle
97a59643f7 Red black coarse space 2021-01-14 20:49:13 -05:00
Peter Boyle
579595f547 Red black on coarse space 2021-01-14 20:48:35 -05:00
Peter Boyle
281ac5fc12 Red black support on coars 2021-01-14 20:48:08 -05:00
Peter Boyle
d8fa903b02 G5 on coarse spaces 2021-01-14 20:47:28 -05:00
Peter Boyle
eaff0f3aeb Gamma5 on coaree spaces 2021-01-14 20:46:58 -05:00
Peter Boyle
e8e20c01b2 Coarsened vector test 2021-01-14 20:46:21 -05:00
Peter Boyle
a4afc3ea2a Red black coarse space 2021-01-14 20:44:16 -05:00
fa12b9a329 bugfix 2021-01-13 10:04:17 +00:00
45fc7ded3a test for sum 2021-01-12 09:10:37 +00:00
74de2d9742 whitespace changes 2021-01-08 18:28:36 +00:00
e759367d42 tested and working 2021-01-08 18:04:50 +00:00
Christoph Lehner
299d0de066 Merge pull request #21 from paboyle/develop
Sync
2020-12-22 20:59:15 +01:00
Peter Boyle
3fe75bc7cb Merge pull request #329 from nmeyer-ur/feature/a64fx-3
Revised dslash/dwf kernels for A64FX
2020-12-20 08:17:15 -05:00
Nils Meyer
45d49d8648 clean up 2020-12-19 03:35:18 +01:00
Nils Meyer
6013183361 removed Asm impls 2020-12-19 03:25:01 +01:00
Nils Meyer
4b882e8056 fixed lost bracket 2020-12-19 03:09:20 +01:00
Nils Meyer
3f9ae6e7e7 Merge branch 'develop' into feature/a64fx-3 2020-12-19 02:37:11 +01:00
Nils Meyer
909acd55cd vnum variant for prefetches 2020-12-19 02:00:22 +01:00
Nils Meyer
4dd9e39e0d up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1 2020-12-19 00:54:31 +01:00
Christoph Lehner
b4c1317ab4 Merge pull request #22 from DanielRichtmann/feature/clover-access-specifier
Clover access specifier
2020-12-18 16:20:19 +01:00
f36d6f3923 compiles on GPU. 3pt still wrong!!!! 2020-12-17 17:04:08 +00:00
Peter Boyle
7adb253e25 Merge pull request #328 from mmphys/feature/mrespatch
Enable existing conserved current code for CUDA
2020-12-17 11:10:29 -05:00
808f1e0e8c merge develop 2020-12-15 16:33:29 +00:00
Michael Marshall
873519e960 Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration 2020-12-14 16:06:10 +00:00
Peter Boyle
9aec4a3c26 SYCL 2020-12-10 02:11:17 -08:00
Daniel Richtmann
c438118fd7 Change access specifier of clover fields in order to allow deriving classes to access these 2020-12-08 14:42:11 +01:00
Peter Boyle
70510d151b Merge pull request #327 from paboyle/feature/gparity_twist_GPU
Feature/gparity twist gpu
2020-12-07 12:02:20 -05:00
Christopher Kelly
9e7bacb5a4 Merge branch 'develop' into feature/gparity_twist_GPU 2020-12-07 11:55:39 -05:00
Christopher Kelly
2ef1fa66a8 Improved performance of G-parity kernel for GPUs by simplifying multLink implementation 2020-12-07 11:53:35 -05:00
Peter Boyle
cf76741ec6 Intel DPCPP Gold happy now (compiles all, runs Benchmark_dwf_fp32 ) 2020-12-03 03:47:11 -08:00
Peter Boyle
497e7c1c40 Duplicate code 2020-12-02 17:55:30 -08:00
Peter Boyle
888eacd3b8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-24 21:46:33 -05:00
Peter Boyle
321f0f51b5 Project to SU(N) 2020-11-24 21:46:10 -05:00
Christoph Lehner
17ec9c5545 Merge pull request #20 from paboyle/develop
Sync
2020-11-24 12:20:43 +01:00
Peter Boyle
30ad9578a2 Merge branch 'lehner-feature/gpt' into develop 2020-11-24 06:10:24 -05:00
Peter Boyle
9dce101586 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into lehner-feature/gpt 2020-11-24 06:10:16 -05:00
Peter Boyle
97e264d0ff Christoph's changes 2020-11-23 15:46:11 +00:00
Peter Boyle
683a5e5bf5 Stencil use host vector for integera table on enable-shared=no and mirror it on device 2020-11-23 15:39:51 +00:00
Peter Boyle
d4861a362c Stencil use non-UVM memory for look up table on enable-shared=no 2020-11-23 15:38:49 +00:00
Peter Boyle
5ff3eae027 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-20 13:14:44 -05:00
Peter Boyle
147dc15d26 Update 2020-11-20 13:13:59 -05:00
Christoph Lehner
c61ea72949 Merge pull request #19 from paboyle/develop
Sync
2020-11-20 17:31:13 +01:00
Peter Boyle
86e8b9fe38 ALLOC_ALIGN removed 2020-11-20 17:07:16 +01:00
Peter Boyle
612e468889 Configurable ALLOC_ALIGN and ALLOC_CACHE 2020-11-20 16:48:28 +01:00
Christoph Lehner
4ea8d128c2 Merge pull request #18 from paboyle/develop
Sync
2020-11-20 15:36:50 +01:00
Peter Boyle
e49b7f2f88 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-19 19:24:41 +01:00
Peter Boyle
aace3d47b9 partial work in progress 2020-11-19 19:24:14 +01:00
Peter Boyle
d5049949a4 Starting to fix reunitarise 2020-11-19 19:23:41 +01:00
Peter Boyle
f1c7480e3c Warning remove 2020-11-19 19:23:03 +01:00
Peter Boyle
5adae5d6ff Unused variable remove 2020-11-19 19:22:12 +01:00
Peter Boyle
a8412ace05 Merge pull request #317 from i-kanamori/develop
adding an error check for input: Parameters.StartingType
2020-11-18 23:09:40 -05:00
Peter Boyle
9fd1c2ad4b Merge pull request #325 from DanielRichtmann/feature/threaded-clover-inversion
Threaded clover term inversion
2020-11-18 23:08:37 -05:00
Peter Boyle
4cf3575353 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-18 03:07:36 +00:00
Peter Boyle
804a810d68 Wildcard mismatch 2020-11-18 03:06:53 +00:00
Peter Boyle
8fcb392e24 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-17 04:51:31 -08:00
Peter Boyle
dd8d70eeff Build without LIME 2020-11-17 04:41:15 -08:00
Peter Boyle
aa8aba6543 --shm-force-mpi 2020-11-16 20:15:50 -05:00
Peter Boyle
13df14f96e Switch off SHM paths with --disable-shm 2020-11-16 18:07:15 -05:00
Peter Boyle
3aab983760 Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency) 2020-11-16 17:13:58 +01:00
Peter Boyle
9c4dcc5ea3 Merge branch 'master' into develop 2020-11-16 16:34:57 +01:00
Peter Boyle
a1063ddbb9 Update options and simplify 2020-11-13 04:11:03 +01:00
Peter Boyle
18ef8056ec Hide Shared Memory 2020-11-13 04:10:40 +01:00
Peter Boyle
1c673977fa Must ask for COMMMS_THREADS 2020-11-13 03:59:36 +01:00
Peter Boyle
e9bc748828 Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel 2020-11-13 03:58:34 +01:00
Peter Boyle
f48156529b Work on 2,2,2,8 ranks 2020-11-13 03:57:58 +01:00
Peter Boyle
d05ce01809 TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED 2020-11-13 03:52:19 +01:00
Peter Boyle
cf23eff60e Device to Device, Memset, cannot assume UVM == Communicable 2020-11-13 03:51:08 +01:00
Peter Boyle
6e313575be Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac 2020-11-13 03:50:16 +01:00
Peter Boyle
b13d1f7238 TOFU compat flag to help Isaaku 2020-11-13 03:49:44 +01:00
Peter Boyle
b5e7945dd9 Option for host or device Cshift implementation 2020-11-13 01:38:54 +01:00
Peter Boyle
7535566f54 Option for bounce through the SHM buffer 2020-11-12 22:54:27 +01:00
Peter Boyle
50b808ab33 Configure option between host and device 2020-11-12 22:28:12 +01:00
Peter Boyle
f16c2665f5 Host memory explict 2020-11-12 20:29:58 +01:00
Peter Boyle
41e28015ae Volume divisible guarantee 2020-11-07 13:32:16 +01:00
3594ce877b speedup in Sigma-to-nucleon 2020-11-03 20:04:30 +00:00
9bae6b889a speedup in Sigma-to-nucleon 2020-11-03 20:03:09 +00:00
4014dfd5b9 first tested version 2020-11-03 16:13:08 +00:00
67023c334b bugfix 2020-11-03 13:07:37 +00:00
a3de7026c8 bugfix 2020-11-03 12:51:50 +00:00
ee11678b1f added Xi-to-Sigma rare decays 2020-11-03 12:41:35 +00:00
Peter Boyle
a0ccbb3bd6 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-01 01:16:35 +00:00
Peter Boyle
5eeabaa2bb HIP fix 2020-11-01 01:16:01 +00:00
Peter Boyle
00d0d6d008 Hip Free managed 2020-10-31 18:14:31 -04:00
Peter Boyle
537a9f7030 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-31 18:13:30 -04:00
Peter Boyle
cc9c993f74 Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt 2020-10-31 18:12:47 -04:00
Peter Boyle
d10422ded8 Test project on group 2020-10-31 18:12:30 -04:00
Peter Boyle
f313565a3c HiP compile 2020-10-31 12:12:40 +00:00
Daniel Richtmann
b3881d2636 Thread inversion of clover term 2020-10-30 16:18:58 +01:00
61d5860b46 Merge pull request #318 from rrhodgson/feature/BaryonSpinMat
Added untraced baryon contraction code
2020-10-28 18:39:59 +00:00
52d17987dc BaryonUtils.h updated debug output 2020-10-23 11:41:08 +01:00
19d8bba97d BaryonUtils function naming change 2020-10-21 11:58:53 +01:00
463d72d322 Added untraced baryon contraction code 2020-10-19 16:13:28 +01:00
d060341168 add an error check for Parameters.StartingType 2020-10-16 21:39:17 +09:00
c772bcd514 Merge https://github.com/paboyle/Grid into develop 2020-10-16 20:30:32 +09:00
Peter Boyle
3362f8dfa0 happy compile 2020-10-14 22:59:41 -04:00
Peter Boyle
bf3c9857e0 Closure changes 2020-10-14 21:37:14 -04:00
Peter Boyle
a88b3ceca5 Closure cases 2020-10-14 21:33:51 -04:00
Peter Boyle
aa135412f5 toComplex, toReal 2020-10-13 22:25:01 -04:00
Peter Boyle
9945399e60 Reaality issues fix by drop from ET 2020-10-13 22:24:32 -04:00
Peter Boyle
5eeffa49e8 Reality forced included 2020-10-13 22:23:57 -04:00
Peter Boyle
3f06209720 Pretty print 2020-10-13 22:18:51 -04:00
467deee46f Merge branch 'debug_512' into develop 2020-10-07 15:18:44 +09:00
Christoph Lehner
80fd6ab407 Merge pull request #17 from paboyle/develop
sync upstream
2020-10-06 09:01:39 +02:00
Christoph Lehner
5534921bee Merge pull request #16 from DanielRichtmann/feature/gpt-coarsenedmatrix
Enable checkerboard operations for CoarsenedMatrix
2020-10-01 10:55:13 +02:00
Christoph Lehner
5cffa05c7e remove slab allocator file 2020-09-13 14:06:25 -04:00
Christoph Lehner
d50a2164d7 remove slab allocator 2020-09-13 14:06:06 -04:00
Christoph Lehner
32ff766dbd fix evict scheme, slab alloc 2020-09-13 14:02:53 -04:00
Christoph Lehner
01652d8cfe SlabAllocator 2020-09-13 05:56:02 -04:00
Daniel Richtmann
4d2dc7ba03 Enable even-odd for CoarsenedMatrix 2020-09-11 20:32:02 +02:00
Christoph Lehner
51d1beb1f3 Merge pull request #15 from paboyle/develop
Sync with upstream
2020-09-07 14:20:33 +02:00
Christoph Lehner
249e2db87d Merge pull request #14 from DanielRichtmann/feature/gpt-coarsenedmatrix
Expose more functions in CMat
2020-08-27 15:18:56 +02:00
Daniel Richtmann
cf3535d16e Expose more functions in CMat 2020-08-27 14:06:48 +02:00
Christoph Lehner
d61ee817f4 Merge pull request #13 from DanielRichtmann/feature/gpt-coarsenedmatrix
Changes needed for GPT MG
2020-08-27 12:11:06 +02:00
Christoph Lehner
2a75516330 state MPI/SLURM message only on world_rank zero 2020-08-26 12:34:17 -04:00
Daniel Richtmann
b2087f14c4 Fix CoarsenedMatrix regarding illegal memory accesses
Need a reference to geom since the lambda copies the this pointer which points to host memory, see
- https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture
- https://devblogs.nvidia.com/new-compiler-features-cuda-8/
2020-08-24 17:46:47 +02:00
Daniel Richtmann
dd1ba266b2 Fix mapping between dir + disp and point in CMat 2020-08-24 17:46:46 +02:00
Daniel Richtmann
1292d59563 Add a typedef + broaden interface of CMat 2020-08-24 17:46:45 +02:00
Christoph Lehner
9877ed9bf8 Merge pull request #12 from paboyle/develop
Sync
2020-08-22 16:35:35 +02:00
Christoph Lehner
f0dc0f3621 fix compile issue on Qpace3 2020-08-22 13:57:33 +02:00
Christoph Lehner
63b0a19f37 Merge pull request #11 from paboyle/develop
Sync
2020-08-20 20:53:39 +02:00
375 changed files with 34942 additions and 19384 deletions

1
.gitignore vendored
View File

@@ -88,6 +88,7 @@ Thumbs.db
# build directory #
###################
build*/*
Documentation/_build
# IDE related files #
#####################

View File

@@ -1,56 +0,0 @@
language: cpp
cache:
directories:
- clang
matrix:
include:
- os: osx
osx_image: xcode8.3
compiler: clang
before_install:
- export GRIDDIR=`pwd`
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which autoconf
- autoconf --version
- which automake
- automake --version
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
script:
- ./bootstrap.sh
- mkdir build
- cd build
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check

View File

@@ -37,19 +37,29 @@ directory
#endif
//disables and intel compiler specific warning (in json.hpp)
#ifdef __ICC
#pragma warning disable 488
#endif
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC

View File

@@ -16,6 +16,7 @@
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
@@ -28,4 +29,7 @@
///////////////////
#include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */

View File

@@ -14,7 +14,11 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
#endif
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")

View File

@@ -21,6 +21,7 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h
endif
all: version-cache Version.h
version-cache:
@@ -53,6 +54,19 @@ Version.h: version-cache
include Make.inc
include Eigen.inc
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS
extra_sources+=$(ZWILS_FERMION_FILES)
endif
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources)

View File

@@ -31,6 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
NAMESPACE_BEGIN(Grid);
@@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
int base = (_d==5) ? 1:0;
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
@@ -72,16 +75,51 @@ public:
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
template<class Fobj,class CComplex,int nbasis>
@@ -224,7 +262,7 @@ public:
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
@@ -258,7 +296,7 @@ public:
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > siteVector;
@@ -268,33 +306,59 @@ public:
typedef iMatrix<CComplex,nbasis > Cobj;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef CoarseVector FermionField;
// enrich interface, use default implementation as in FermionOperator ///////
void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
////////////////////
// Data members
////////////////////
Geometry geom;
GridBase * _grid;
GridBase* _cbgrid;
int hermitian;
CartesianStencil<siteVector,siteVector,int> Stencil;
CartesianStencil<siteVector,siteVector,int> StencilEven;
CartesianStencil<siteVector,siteVector,int> StencilOdd;
std::vector<CoarseMatrix> A;
std::vector<CoarseMatrix> Aeven;
std::vector<CoarseMatrix> Aodd;
CoarseMatrix AselfInv;
CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd;
Vector<RealD> dag_factor;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
GridBase * RedBlackGrid() { return _cbgrid; };
int ConstEE() { return 0; }
void M (const CoarseVector &in, CoarseVector &out)
{
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
@@ -316,14 +380,14 @@ public:
int ptype;
StencilEntry *SE;
for(int point=0;point<geom.npoint;point++){
for(int point=0;point<npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss);
SE=Stencil_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
@@ -344,12 +408,74 @@ public:
return M(in,out);
} else {
// corresponds to Galerkin coarsening
CoarseVector tmp(Grid());
G5C(tmp, in);
M(tmp, out);
G5C(out, out);
return MdagNonHermitian(in, out);
}
};
void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
{
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int osites=Grid()->oSites();
Vector<int> points(geom.npoint, 0);
for(int p=0; p<geom.npoint; p++)
points[p] = geom.points_dagger[p];
auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0];
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=Stencil_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
void MdirComms(const CoarseVector &in)
{
SimpleCompressor<siteVector> compressor;
@@ -359,6 +485,7 @@ public:
{
conformable(_grid,in.Grid());
conformable(_grid,out.Grid());
out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
@@ -367,6 +494,7 @@ public:
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
autoView( Stencil_v , Stencil, AcceleratorRead);
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
@@ -380,12 +508,12 @@ public:
int ptype;
StencilEntry *SE;
SE=Stencil.GetEntry(ptype,point,ss);
SE=Stencil_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
@@ -413,34 +541,7 @@ public:
this->MdirComms(in);
int ndim = in.Grid()->Nd();
//////////////
// 4D action like wilson
// 0+ => 0
// 0- => 1
// 1+ => 2
// 1- => 3
// etc..
//////////////
// 5D action like DWF
// 1+ => 0
// 1- => 1
// 2+ => 2
// 2- => 3
// etc..
auto point = [dir, disp, ndim](){
if(dir == 0 and disp == 0)
return 8;
else if ( ndim==4 ) {
return (4 * dir + 1 - disp) / 2;
} else {
return (4 * (dir-1) + 1 - disp) / 2;
}
}();
MdirCalc(in,out,point);
MdirCalc(in,out,geom.point(dir,disp));
};
void Mdiag(const CoarseVector &in, CoarseVector &out)
@@ -449,23 +550,298 @@ public:
MdirCalc(in, out, point); // No comms
};
void Mooee(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerNo, InverseNo);
}
void MooeeInv(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerNo, InverseYes);
}
void MooeeDag(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerYes, InverseNo);
}
void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerYes, InverseYes);
}
void Meooe(const CoarseVector &in, CoarseVector &out) {
if(in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
void MeooeDag(const CoarseVector &in, CoarseVector &out) {
if(in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, A, in, out, dag);
}
void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, Aodd, in, out, dag);
}
void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, Aeven, in, out, dag);
}
void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
out.Checkerboard() = in.Checkerboard();
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
CoarseMatrix *Aself = nullptr;
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
DselfInternal(StencilOdd, *Aself, in, out, dag);
} else {
Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
DselfInternal(StencilEven, *Aself, in, out, dag);
}
} else {
Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
DselfInternal(Stencil, *Aself, in, out, dag);
}
assert(Aself != nullptr);
}
void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
const CoarseVector &in, CoarseVector &out, int dag) {
int point = geom.npoint-1;
autoView( out_v, out, AcceleratorWrite);
autoView( in_v, in, AcceleratorRead);
autoView( st_v, st, AcceleratorRead);
autoView( a_v, a, AcceleratorRead);
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
RealD* dag_factor_p = &dag_factor[0];
if(dag) {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res);
});
} else {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res);
});
}
}
void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
const CoarseVector &in, CoarseVector &out, int dag) {
SimpleCompressor<siteVector> compressor;
st.HaloExchange(in,compressor);
autoView( in_v, in, AcceleratorRead);
autoView( out_v, out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
typedef LatticeView<Cobj> Aview;
// determine in what order we need the points
int npoint = geom.npoint-1;
Vector<int> points(npoint, 0);
for(int p=0; p<npoint; p++)
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
auto points_p = &points[0];
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
RealD* dag_factor_p = &dag_factor[0];
if(dag) {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
} else {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
}
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
_grid(&CoarseGrid),
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid)
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,_cbgrid),
Aodd(geom.npoint,_cbgrid),
AselfInv(&CoarseGrid),
AselfInvEven(_cbgrid),
AselfInvOdd(_cbgrid),
dag_factor(nbasis*nbasis)
{
fillFactor();
};
CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) :
_grid(&CoarseGrid),
_cbgrid(&CoarseRBGrid),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,&CoarseRBGrid),
Aodd(geom.npoint,&CoarseRBGrid),
AselfInv(&CoarseGrid),
AselfInvEven(&CoarseRBGrid),
AselfInvOdd(&CoarseRBGrid),
dag_factor(nbasis*nbasis)
{
fillFactor();
};
void fillFactor() {
Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis);
if(!hermitian) {
const int nb = nbasis/2;
dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
}
// GPU readable prefactor
thread_for(i, nbasis*nbasis, {
int j = i/nbasis;
int k = i%nbasis;
dag_factor[i] = dag_factor_eigen(j, k);
});
}
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
typedef typename Fobj::scalar_type scalar_type;
std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
@@ -496,11 +872,13 @@ public:
CoarseScalar InnerProd(Grid());
std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl;
// Orthogonalise the subblocks over the basis
blockOrthogonalise(InnerProd,Subspace.subspace);
// Compute the matrix elements of linop between this orthonormal
// set of vectors.
std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
int self_stencil=-1;
for(int p=0;p<geom.npoint;p++)
{
@@ -539,7 +917,7 @@ public:
phi=Subspace.subspace[i];
// std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
linop.OpDirAll(phi,Mphi_p);
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
@@ -568,6 +946,18 @@ public:
autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
if ( hermitian && (disp==-1) ) {
for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
int dirp = geom.directions[pp];
int dispp = geom.displacements[pp];
if ( (dirp==dir) && (dispp==1) ){
auto sft = conjugate(Cshift(oZProj,dir,1));
autoView( sft_v , sft , AcceleratorWrite);
autoView( A_pp , A[pp], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
}
}
}
}
}
@@ -606,28 +996,54 @@ public:
}
if(hermitian) {
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
ForceHermitian();
}
}
void ForceHermitian(void) {
CoarseMatrix Diff (Grid());
for(int p=0;p<geom.npoint;p++){
int dir = geom.directions[p];
int disp = geom.displacements[p];
if(disp==-1) {
// Find the opposite link
for(int pp=0;pp<geom.npoint;pp++){
int dirp = geom.directions[pp];
int dispp = geom.displacements[pp];
if ( (dirp==dir) && (dispp==1) ){
// Diff = adj(Cshift(A[p],dir,1)) - A[pp];
// std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
A[pp] = adj(Cshift(A[p],dir,1));
}
InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
}
void InvertSelfStencilLink() {
std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
int localVolume = Grid()->lSites();
typedef typename Cobj::scalar_object scalar_object;
autoView(Aself_v, A[geom.npoint-1], CpuRead);
autoView(AselfInv_v, AselfInv, CpuWrite);
thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
scalar_object selfLink = Zero();
scalar_object selfLinkInv = Zero();
Coordinate lcoor;
Grid()->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(selfLink, Aself_v, lcoor);
for (int i = 0; i < nbasis; ++i)
for (int j = 0; j < nbasis; ++j)
selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
selfLinkInvEigen = selfLinkEigen.inverse();
for(int i = 0; i < nbasis; ++i)
for(int j = 0; j < nbasis; ++j)
selfLinkInv(i, j) = selfLinkInvEigen(i, j);
pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
});
}
void FillHalfCbs() {
std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
for(int p = 0; p < geom.npoint; ++p) {
pickCheckerboard(Even, Aeven[p], A[p]);
pickCheckerboard(Odd, Aodd[p], A[p]);
}
pickCheckerboard(Even, AselfInvEven, AselfInv);
pickCheckerboard(Odd, AselfInvOdd, AselfInv);
}
};

View File

@@ -136,7 +136,7 @@ public:
flops=0;
usec =0;
Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors);
sgrid = new GridCartesian(dimensions,layout,processors,*grid);
};
~FFT ( void) {
@@ -182,7 +182,7 @@ public:
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
// Construct pencils
typedef typename vobj::scalar_object sobj;

View File

@@ -52,6 +52,7 @@ public:
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0;
virtual ~LinearOperatorBase(){};
};
@@ -507,7 +508,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
virtual void MpcDag (const Field &in, Field &out){
Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
virtual void MpcDagMpc(const Field &in, Field &out) {
assert(0);// Never need with staggered
}
};
@@ -530,6 +531,16 @@ public:
template<class Field> class LinearFunction {
public:
virtual void operator() (const Field &in, Field &out) = 0;
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{
assert(in.size() == out.size());
for (unsigned int i = 0; i < in.size(); ++i)
{
(*this)(in[i], out[i]);
}
}
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
@@ -575,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
@@ -588,6 +600,7 @@ public:
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;

View File

@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class Field> using Preconditioner = LinearFunction<Field> ;
/*
template<class Field> class Preconditioner : public LinearFunction<Field> {
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field & psi)=0;
};
*/
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
void operator()(const Field &src, Field & psi){
using Preconditioner<Field>::operator();
virtual void operator()(const Field &src, Field & psi){
psi = src;
}
TrivialPrecon(void){};

View File

@@ -48,6 +48,7 @@ public:
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
virtual ~SparseMatrixBase() {};
};
/////////////////////////////////////////////////////////////////////////////////////////////
@@ -72,7 +73,7 @@ public:
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual ~CheckerBoardedSparseMatrixBase() {};
};
NAMESPACE_END(Grid);

View File

@@ -264,7 +264,7 @@ public:
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});

View File

@@ -37,6 +37,7 @@ template<class FieldD, class FieldF, typename std::enable_if< getPrecision<Field
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
{
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;

View File

@@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;

View File

@@ -33,16 +33,19 @@ namespace Grid {
template<class Field>
class ZeroGuesser: public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
};
template<class Field>
class DoNothingGuesser: public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { };
};
template<class Field>
class SourceGuesser: public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = src; };
};
@@ -54,15 +57,24 @@ class DeflatedGuesser: public LinearFunction<Field> {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
const unsigned int N;
public:
using LinearFunction<Field>::operator();
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
: DeflatedGuesser(_evec, _eval, _evec.size())
{}
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N)
{
assert(evec.size()==eval.size());
assert(N <= evec.size());
}
virtual void operator()(const Field &src,Field &guess) {
guess = Zero();
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
@@ -79,6 +91,7 @@ private:
const std::vector<RealD> &eval_coarse;
public:
using LinearFunction<FineField>::operator();
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
@@ -100,7 +113,43 @@ public:
blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard();
};
};
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
int Nevec = (int)evec_coarse.size();
int Nsrc = (int)src.size();
// make temp variables
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
//Preporcessing
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
guess_coarse[j] = Zero();
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockProject(src_coarse[j],src[j],subspace);
}
//deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
for (int i=0;i<Nevec;i++)
{
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
const CoarseField & tmp = evec_coarse[i];
for (int j=0;j<Nsrc;j++)
{
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
}
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
};

View File

@@ -67,6 +67,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@@ -97,6 +98,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field

View File

@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;

View File

@@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
@@ -119,7 +119,8 @@ public:
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b, zAz;
ComplexD a, b;
// ComplexD zAz;
RealD zAAz;
ComplexD rq;
@@ -146,7 +147,7 @@ public:
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
@@ -170,7 +171,7 @@ public:
LinalgTimer.Start();
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
@@ -212,7 +213,7 @@ public:
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
zAz = innerProduct(Az,psi);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();

View File

@@ -132,6 +132,31 @@ namespace Grid {
(*this)(_Matrix,in,out,guess);
}
void RedBlackSource(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
}
// James can write his own deflated guesser
// with optimised code for the inner products
// RedBlackSolveSplitGrid();
// RedBlackSolve(_Matrix,src_o,sol_o);
void RedBlackSolution(Matrix &_Matrix, const std::vector<Field> &in, const std::vector<Field> &sol_o, std::vector<Field> &out)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++) {
pickCheckerboard(Even,tmp,in[b]);
RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
}
}
template<class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
{
@@ -150,22 +175,27 @@ namespace Grid {
////////////////////////////////////////////////
// Prepare RedBlack source
////////////////////////////////////////////////
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
RedBlackSource(_Matrix,in,src_o);
// for(int b=0;b<nblock;b++){
// RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
// }
////////////////////////////////////////////////
// Make the guesses
////////////////////////////////////////////////
if ( subGuess ) guess_save.resize(nblock,grid);
for(int b=0;b<nblock;b++){
if(useSolnAsInitGuess) {
for(int b=0;b<nblock;b++){
pickCheckerboard(Odd, sol_o[b], out[b]);
}
} else {
guess(src_o[b],sol_o[b]);
guess(src_o, sol_o);
}
if ( subGuess ) {
for(int b=0;b<nblock;b++){
guess_save[b] = sol_o[b];
}
}

View File

@@ -1,67 +0,0 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@@ -165,9 +165,18 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
//template<class T> using commAllocator = devAllocator<T>;
#ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid);

View File

@@ -9,14 +9,30 @@ NAMESPACE_BEGIN(Grid);
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : PrintBytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
uint64_t cacheBytes;
cacheBytes = CacheBytes[Cpu];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Acc];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Shared];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
#ifdef GRID_CUDA
cuda_mem();
#endif
}
//////////////////////////////////////////////////////////////////////
@@ -24,86 +40,114 @@ void MemoryManager::PrintBytes(void)
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
total_device+=bytes;
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
total_device-=bytes;
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
total_device-=bytes;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorFree "<<std::endl;
PrintBytes();
#endif
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
total_shared+=bytes;
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes;
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
total_shared-=bytes;
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_shared-=bytes;
// PrintBytes();
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedFree "<<std::endl;
PrintBytes();
#endif
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_host-=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
total_host+=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
total_host-=bytes;
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#endif
@@ -115,7 +159,6 @@ void MemoryManager::Init(void)
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
@@ -181,13 +224,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
@@ -211,6 +254,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
if ( entries[v].valid ) {
ret = entries[v].address;
cacheBytes -= entries[v].bytes;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
@@ -219,6 +263,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
cacheBytes += bytes;
return ret;
}
@@ -228,13 +273,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
@@ -243,6 +288,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
cacheBytes -= entries[e].bytes;
return entries[e].address;
}
}

View File

@@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/
@@ -84,14 +82,15 @@ private:
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
static uint64_t CacheBytes[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
static void PrintBytes(void);
public:
@@ -171,6 +170,7 @@ private:
public:
static void Print(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);

View File

@@ -1,11 +1,12 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
@@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
// dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
// dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
// dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
@@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
// dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
// dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
// dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
@@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,hint);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
@@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
assert(0);
}
@@ -361,13 +368,16 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,transient);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
@@ -419,6 +429,7 @@ void MemoryManager::NotifyDeletion(void *_ptr)
}
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
@@ -463,6 +474,32 @@ int MemoryManager::isOpen (void* _CpuPtr)
}
}
void MemoryManager::PrintState(void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
} else {
std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,7 +1,6 @@
#include <Grid/GridCore.h>
#ifdef GRID_UVM
#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
@@ -17,6 +16,10 @@ uint64_t MemoryManager::DeviceToHostXfer;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::PrintState(void* CpuPtr)
{
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::NotifyDeletion(void *ptr){};

View File

@@ -36,7 +36,7 @@ static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk)
{
int nd=rdim.size();
Coordinate coor(nd);

View File

@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
bool Stencil_force_mpi = true;
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////

View File

@@ -1,4 +1,3 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -36,6 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
class CartesianCommunicator : public SharedMemory {
public:
@@ -108,6 +109,8 @@ public:
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalMax(RealD &);
void GlobalMax(RealF &);
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);

View File

@@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
#ifndef GRID_COMMS_THREADS
nCommThreads=1;
// wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
@@ -275,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(float &f)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
@@ -358,29 +368,41 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
// TODO : make a OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
// if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
// this->StencilSendToRecvFromComplete(list,dir);
// }
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
int nreq=list.size();
if (nreq==0) return;

View File

@@ -67,6 +67,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalMax(float &){}
void CartesianCommunicator::GlobalMax(double &){}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}

View File

@@ -102,7 +102,7 @@ public:
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};

View File

@@ -7,6 +7,7 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -34,6 +35,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCl
#endif
NAMESPACE_BEGIN(Grid);
@@ -69,6 +73,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ?
/////////////////////////////////////////////////////////////////////
@@ -169,6 +174,23 @@ static inline int divides(int a,int b)
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
@@ -428,7 +450,47 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP)
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
//if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = ShmCommBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@@ -452,17 +514,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -472,6 +533,29 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
}
#endif
#ifdef GRID_CUDA
cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
@@ -492,6 +576,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
}
}
#endif
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
@@ -507,7 +592,35 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
}
assert(thisBuf!=nullptr);
}
#endif
#ifdef GRID_CUDA
if ( r!=WorldShmRank ) {
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
@@ -529,6 +642,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
}
WorldShmCommBufs[r] = thisBuf;
#else
WorldShmCommBufs[r] = ShmCommBuf;
@@ -538,6 +652,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#else
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
@@ -665,7 +781,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap");
assert(0);
@@ -709,16 +824,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
#ifdef GRID_CUDA
cudaMemset(dest,0,bytes);
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorMemSet(dest,0,bytes);
#else
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#ifdef GRID_CUDA
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
@@ -771,25 +886,18 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_IBM_SUMMIT
// Hide the shared memory path between sockets
// if even number of nodes
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
#ifdef GRID_SHM_FORCE_MPI
// Hide the shared memory path between ranks
{
for(int r=0;r<size;r++){
int hisRank=ShmRanks[r];
if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
if ( r!=rank ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
}
}
#endif
SharedMemoryTest();
//SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier

View File

@@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryNone: "
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap
////////////////////////////////////////////////////////////////////////////////////////////
#if 1
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@@ -83,7 +116,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes;
_ShmAlloc=1;
};
#endif
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
}
}
{
autoView(rhs_v , rhs, AcceleratorRead);
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@@ -103,21 +110,36 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for2d(n,e1,b,e2,1,{
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
} else {
autoView(rhs_v , rhs, AcceleratorRead);
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
accelerator_for2d(n,e1,b,e2,1,{
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
Coordinate coor;
@@ -134,13 +156,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset);
}
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -182,12 +224,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
}
{
autoView( rhs_v, rhs, AcceleratorWrite);
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
}
@@ -208,19 +257,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) {
autoView( rhs_v , rhs, AcceleratorWrite);
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
accelerator_for2d(n,e1,b,e2,1,{
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
assert(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite);
@@ -280,12 +340,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}
{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@@ -324,12 +392,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}
{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
}
}

View File

@@ -101,7 +101,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
@@ -121,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size);
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -138,7 +139,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
} else {
int words = send_buf.size();
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
@@ -150,12 +151,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -195,8 +198,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
@@ -242,11 +252,204 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
@@ -258,7 +461,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
}
}
#endif
NAMESPACE_END(Grid);
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
//#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>

View File

@@ -342,19 +342,14 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a));
@@ -456,20 +451,17 @@ GridTrinOpClass(TrinaryWhere,
GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(adj, UnaryAdj);
GRID_DEF_UNOP(conjugate, UnaryConj);
//GRID_DEF_UNOP(adj, UnaryAdj);
//GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin);
@@ -494,27 +486,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
/////////////////////////////////////////////////////////////
template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))>
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
{
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
return ret;
}
template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
{
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
return ret;
}
template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
vecEval(0, expr.arg2),
vecEval(0, expr.arg3)))>
vecEval(0, expr.arg3)))>::type >
{
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
vecEval(0, expr.arg2),
vecEval(0, expr.arg3)))> ret(expr);
vecEval(0, expr.arg3)))>::type > ret(expr);
return ret;
}
#define EXPRESSION_CLOSURE(function) \

View File

@@ -225,7 +225,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
autoView( x_v , x, AcceleratorRead);
autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss);
auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]);
coalescedWrite(ret_v[ss],tmp);
});
}

View File

@@ -88,6 +88,13 @@ public:
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose();
}
// Helper function to print the state of this object in the AccCache
void PrintCacheState(void)
{
MemoryManager::PrintState(this->_odata);
}
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device

View File

@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
@@ -125,7 +125,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
}
});
@@ -134,7 +134,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
@@ -161,11 +161,13 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
vobj zzz=Zero();
auto B=coalescedRead(zzz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
}
coalescedWrite(result_v[ss], B);
});

View File

@@ -45,8 +45,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = adj(lhs_v[ss]);
});
return ret;
};
@@ -64,6 +64,53 @@ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
return ret;
};
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
Lattice<typename vobj::Complexified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toComplex(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
Lattice<typename vobj::Realified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toReal(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toComplex(const Expression &expr) -> decltype(closure(expr))
{
return toComplex(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toReal(const Expression &expr) -> decltype(closure(expr))
{
return toReal(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto adj(const Expression &expr) -> decltype(closure(expr))
{
return adj(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto conjugate(const Expression &expr) -> decltype(closure(expr))
{
return conjugate(closure(expr));
}
NAMESPACE_END(Grid);
#endif

View File

@@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
NAMESPACE_BEGIN(Grid);
@@ -96,12 +99,38 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
ssobj ret = ssum;
return ret;
}
/*
Threaded max, don't use for now
template<class Double>
inline Double max(const Double *arg, Integer osites)
{
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
std::vector<Double> maxarray(nthread);
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
Double max=arg[0];
for(int ss=myoff;ss<mywork+myoff; ss++){
if( arg[ss] > max ) max = arg[ss];
}
maxarray[thr]=max;
});
Double tmax=maxarray[0];
for(int i=0;i<nthread;i++){
if (maxarray[i]>tmax) tmax = maxarray[i];
}
return tmax;
}
*/
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
@@ -110,20 +139,45 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
typename vobj::scalar_object ssum;
autoView( arg_v, arg, AcceleratorRead);
ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum);
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu(&arg_v[0],osites);
auto ssum= sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
@@ -141,6 +195,32 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm);
}
//The global maximum of the site norm2
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
{
typedef typename vobj::tensor_reduced vscalar; //iScalar<iScalar<.... <vPODtype> > >
typedef typename vscalar::scalar_object scalar; //iScalar<iScalar<.... <PODtype> > >
Lattice<vscalar> inner = localNorm2(arg);
auto grid = arg.Grid();
RealD max;
for(int l=0;l<grid->lSites();l++){
Coordinate coor;
scalar val;
RealD r;
grid->LocalIndexToLocalCoor(l,coor);
peekLocalSite(val,inner,coor);
r=real(TensorRemove(val));
if( (l==0) || (r>max)){
max=r;
}
}
grid->GlobalMax(max);
return max;
}
// Double inner product
template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -158,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
@@ -309,6 +388,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// But easily avoided by using double precision fields
///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid();
assert(grid!=NULL);
@@ -367,20 +447,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
}
// sum over nodes.
sobj gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
result[t]=lsSum[lt];
} else {
gsum=Zero();
result[t]=Zero();
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
}
template<class vobj>

View File

@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
}
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
#ifdef GRID_CUDA
@@ -37,14 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
/*
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
*/
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
@@ -52,10 +51,14 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
// let the number of threads in a block be a multiple of 2, starting from warpSize
threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
return 0;
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
return 1;
}
template <class sobj, class Iterator>
@@ -195,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
@@ -204,7 +207,9 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
@@ -215,6 +220,54 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
auto result = buffer_v[0];
return result;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
scalarD *ret_p = (scalarD *)&ret;
const int words = sizeof(vobj)/sizeof(vector);
Vector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
ret_p[w] = sumD_gpu_small(tbuf,osites);
}
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
if ( ok ) {
ret = sumD_gpu_small(lat,osites);
} else {
ret = sumD_gpu_large(lat,osites);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -227,6 +280,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,125 @@
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
sobj identity; zeroit(identity);
sobj ret ;
Integer nsimd= vobj::Nsimd();
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction,
[=] (cl::sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
theGridAccelerator->wait();
ret = mysum[0];
free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret);
return dret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
return sumD_gpu_tensor(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);
/*
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
{
Double sumResult; zeroit(sumResult);
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
Double identity; zeroit(identity);
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum +=vec[index];
});
});
theGridAccelerator->wait();
Double ret = d_sum[0];
free(d_sum,*theGridAccelerator);
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/

View File

@@ -85,6 +85,76 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
});
}
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
{
half.Checkerboard() = cb;
autoView(half_v, half, AcceleratorWrite);
autoView(full_v, full, AcceleratorRead);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++) {
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(half_v[ssh],full_v(ss));
}
});
}
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
{
int cb = half.Checkerboard();
autoView(half_v , half, AcceleratorRead);
autoView(full_v , full, AcceleratorWrite);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++){
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(full_v[ss],half_v(ssh));
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Flexible Type Conversion for internal promotion to double as well as graceful
// treatment of scalar-compatible types
@@ -97,6 +167,20 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
out = in;
}
template<typename T>
accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
out = in;
}
// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
/*template<typename T1, typename T2>
accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
convertType(T1 & out, const T2 & in) {
out = in;
}*/
#ifdef GRID_SIMT
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
@@ -117,18 +201,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
template<typename T1,typename T2>
accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) {
convertType(out._internal,in._internal);
}
template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
template<typename T1,typename T2>
accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) {
convertType(out,in._internal);
}
template<typename T1,typename T2>
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) {
convertType(out._internal,in);
}
@@ -145,11 +229,6 @@ accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & i
convertType(out._internal[i],in._internal[i]);
}
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
accelerator_inline void convertType(T & out, const T & in) {
out = in;
}
template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
autoView( out_v , out,AcceleratorWrite);
@@ -355,15 +434,21 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( fineData_ , fineData, AcceleratorRead);
auto coarseData_p = &coarseData_[0];
auto fineData_p = &fineData_[0];
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
coarseData_[sc]=Zero();
vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){
@@ -374,9 +459,11 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
cd=cd+fineData_p[sf];
}
coarseData_p[sc] = cd;
});
return;
}

View File

@@ -67,8 +67,13 @@ public:
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
#if 1
// accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; };
#else
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
#endif
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };

View File

@@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
conformable(iftrue,predicate);
conformable(iftrue,ret);
GridBase *grid=iftrue._grid;
GridBase *grid=iftrue.Grid();
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
@@ -52,22 +52,23 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
const int Nsimd = grid->Nsimd();
std::vector<Integer> mask(Nsimd);
std::vector<scalar_object> truevals (Nsimd);
std::vector<scalar_object> falsevals(Nsimd);
parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
extract(iftrue._odata[ss] ,truevals);
extract(iffalse._odata[ss] ,falsevals);
extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
for(int s=0;s<Nsimd;s++){
if (mask[s]) falsevals[s]=truevals[s];
}
merge(ret._odata[ss],falsevals);
autoView(iftrue_v,iftrue,CpuRead);
autoView(iffalse_v,iffalse,CpuRead);
autoView(predicate_v,predicate,CpuRead);
autoView(ret_v,ret,CpuWrite);
Integer NN= grid->oSites();
thread_for(ss,NN,{
Integer mask;
scalar_object trueval;
scalar_object falseval;
for(int l=0;l<Nsimd;l++){
trueval =extractLane(l,iftrue_v[ss]);
falseval=extractLane(l,iffalse_v[ss]);
mask =extractLane(l,predicate_v[ss]);
if (mask) falseval=trueval;
insertLane(l,ret_v[ss],falseval);
}
});
}
template<class vobj,class iobj>
@@ -76,9 +77,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
Lattice<vobj> ret(iftrue._grid);
Lattice<vobj> ret(iftrue.Grid());
where(ret,predicate,iftrue,iffalse);
whereWolf(ret,predicate,iftrue,iffalse);
return ret;
}

View File

@@ -31,6 +31,7 @@ directory
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <map>
#include <pwd.h>
@@ -123,7 +124,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
////////////////////////////////////////////////////////////
// Helper to fill out metadata
////////////////////////////////////////////////////////////
template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
FieldMetaData &header,
scidacRecord & _scidacRecord,
scidacFile & _scidacFile)
@@ -576,6 +577,8 @@ class ScidacReader : public GridLimeReader {
std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
// in principle should do the line below, but that breaks backard compatibility with old data
// skipPastObjectRecord(std::string(GRID_FIELD_NORM));
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return;
}
@@ -619,12 +622,12 @@ class IldgWriter : public ScidacWriter {
// Don't require scidac records EXCEPT checksum
// Use Grid MetaData object if present.
////////////////////////////////////////////////////////////////
template <class vsimd>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
template <class stats = PeriodicGaugeStatistics>
void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description)
{
GridBase * grid = Umu.Grid();
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef Lattice<vLorentzColourMatrixD> GaugeField;
typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj;
////////////////////////////////////////
@@ -636,6 +639,9 @@ class IldgWriter : public ScidacWriter {
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
stats Stats;
Stats(Umu,header);
std::string format = header.floating_point;
header.ensemble_id = description;
header.ensemble_label = description;
@@ -649,7 +655,8 @@ class IldgWriter : public ScidacWriter {
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
ildgfmt.field = std::string("su3gauge");
const std::string stNC = std::to_string( Nc ) ;
ildgfmt.field = std::string("su"+stNC+"gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
@@ -705,10 +712,10 @@ class IldgReader : public GridLimeReader {
// Else use ILDG MetaData object if present.
// Else use SciDAC MetaData object if present.
////////////////////////////////////////////////////////////////
template <class vsimd>
void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
template <class stats = PeriodicGaugeStatistics>
void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef Lattice<vLorentzColourMatrixD > GaugeField;
typedef typename GaugeField::vector_object vobj;
typedef typename vobj::scalar_object sobj;
@@ -866,7 +873,8 @@ class IldgReader : public GridLimeReader {
} else {
assert(found_ildgFormat);
assert ( ildgFormat_.field == std::string("su3gauge") );
const std::string stNC = std::to_string( Nc ) ;
assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@@ -874,7 +882,7 @@ class IldgReader : public GridLimeReader {
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);
@@ -921,7 +929,8 @@ class IldgReader : public GridLimeReader {
if ( found_FieldMetaData || found_usqcdInfo ) {
FieldMetaData checker;
GaugeStatistics(Umu,checker);
stats Stats;
Stats(Umu,checker);
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;

View File

@@ -6,8 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t);
std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z");
oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str();
header.archive_date = header.creation_date;
@@ -176,29 +176,18 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
GridMetaData(grid,header);
MachineCharacteristics(header);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
template<class Impl>
class GaugeStatistics
{
// How to convert data precision etc...
header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
public:
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
header.link_trace = WilsonLoops<Impl>::linkTrace(data);
header.plaquette = WilsonLoops<Impl>::avgPlaquette(data);
}
};
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{
GridBase *grid = field.Grid();
@@ -206,7 +195,6 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
@@ -215,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
assert( Nc < 4 && Nc > 1 ) ;
for(int mu=0;mu<Nd;mu++){
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
cm(mu)()(1,1) = adj(cm(mu)()(0,x)) ;
#else
const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#endif
}
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -290,7 +282,6 @@ struct GaugeSimpleMunger{
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
@@ -329,8 +320,8 @@ template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
@@ -342,8 +333,8 @@ template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}

View File

@@ -9,6 +9,7 @@
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -30,6 +31,8 @@
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
#include <string>
NAMESPACE_BEGIN(Grid);
using namespace Grid;
@@ -40,6 +43,8 @@ using namespace Grid;
class NerscIO : public BinaryIO {
public:
typedef Lattice<vLorentzColourMatrixD> GaugeField;
static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out);
}
@@ -129,12 +134,12 @@ public:
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void readConfiguration(GaugeField &Umu,
FieldMetaData& header,
std::string file)
std::string file,
GaugeStats GaugeStatisticsCalculator=GaugeStats())
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu.Grid();
uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -143,33 +148,35 @@ public:
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
const int ieee32big = (format == std::string("IEEE32BIG"));
const int ieee32 = (format == std::string("IEEE32"));
const int ieee64big = (format == std::string("IEEE64BIG"));
const int ieee64 = (format == std::string("IEEE64") || \
format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
const std::string stNC = std::to_string( Nc ) ;
if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
} else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
@@ -177,7 +184,7 @@ public:
assert(0);
}
GaugeStatistics(Umu,clone);
GaugeStats Stats; Stats(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
@@ -203,24 +210,33 @@ public:
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
}
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
// Preferred interface
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
int two_row,
int bits32)
int bits32,
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
header.sequence_number = sequence_number;
header.ensemble_id = ens_id;
header.ensemble_label = ens_label;
header.hdr_version = "1.0" ;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
@@ -229,15 +245,19 @@ public:
GridMetaData(grid,header);
assert(header.nd==4);
GaugeStatistics(Umu,header);
GaugeStats Stats; Stats(Umu,header);
MachineCharacteristics(header);
uint64_t offset;
// Sod it -- always write 3x3 double
// Sod it -- always write NcxNc double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
const std::string stNC = std::to_string( Nc ) ;
if( two_row ) {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
} else {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
}
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
@@ -245,8 +265,15 @@ public:
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
if( two_row ) {
Gauge3x2unmunger<fobj2D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
}
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
@@ -279,7 +306,6 @@ public:
MachineCharacteristics(header);
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");

View File

@@ -154,7 +154,7 @@ public:
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
GaugeStatistics(Umu, clone);
PeriodicGaugeStatistics Stats; Stats(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

View File

@@ -208,7 +208,7 @@ public:
FieldMetaData clone(header);
GaugeStatistics(Umu, clone);
PeriodicGaugeStatistics Stats; Stats(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

View File

@@ -16,8 +16,12 @@
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#endif
#endif
#include "pugixml.h"

View File

@@ -80,6 +80,13 @@ template<typename T> struct isSpinor {
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
const int CoarseIndex = 4;
template<typename T> struct isCoarsened {
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
};
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
// ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where Wilson operator
@@ -444,9 +451,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
// Fermion <-> propagator assignements
//////////////////////////////////////////////
//template <class Prop, class Ferm>
#define FAST_FERM_TO_PROP
template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,AcceleratorWrite);
autoView(f_v,f,AcceleratorRead);
accelerator_for(idx,p_v.oSites(),1,{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
}}
});
#else
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
@@ -458,12 +476,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
}
pokeSpin(p, pjs, j, s);
}
#endif
}
//template <class Prop, class Ferm>
template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,AcceleratorWrite);
autoView(f_v,f,AcceleratorRead);
accelerator_for(idx,p_v.oSites(),1,{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
}}
});
#else
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
@@ -475,6 +504,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
}
pokeSpin(f, fj, j);
}
#endif
}
//////////////////////////////////////////////

View File

@@ -41,7 +41,7 @@ class Action
public:
bool is_smeared = false;
// Heatbath?
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
virtual std::string action_name() = 0; // return the action name

View File

@@ -68,9 +68,16 @@ public:
///////////////////////////////////////////////////////////////
// Support for MADWF tricks
///////////////////////////////////////////////////////////////
RealD Mass(void) { return mass; };
RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
RealD MassPlus(void) { return mass_plus; };
RealD MassMinus(void) { return mass_minus; };
void SetMass(RealD _mass) {
mass=_mass;
mass_plus=mass_minus=_mass;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void SetMass(RealD _mass_plus, RealD _mass_minus) {
mass_plus=_mass_plus;
mass_minus=_mass_minus;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void P(const FermionField &psi, FermionField &chi);
@@ -108,7 +115,7 @@ public:
void MeooeDag5D (const FermionField &in, FermionField &out);
// protected:
RealD mass;
RealD mass_plus, mass_minus;
// Save arguments to SetCoefficientsInternal
Vector<Coeff_t> _gamma;

View File

@@ -0,0 +1,333 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
////////////////////////////////////////////
// Standard Clover
// (4+m0) + csw * clover_term
// Exp Clover
// (4+m0) * exp(csw/(4+m0) clover_term)
// = (4+m0) + csw * clover_term + ...
////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
//////////////////////////////////
// Generic Standard Clover
//////////////////////////////////
template<class Impl>
class CloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
GridBase *grid = CloverTerm.Grid();
CloverTerm += diag_mass;
int lvol = grid->lSites();
int DimRep = Impl::Dimension;
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
EigenInvCloverOp = EigenCloverOp.inverse();
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Generic Exp Clover
//////////////////////////////////
template<class Impl>
class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef WilsonCloverHelpers<Impl> Helpers;
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
Clover *= (1.0/diag_mass);
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
// qN = cN
// qn = cn + qn+1 X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
// prepare inverse
CloverInv = (-1.0)*Clover;
Clover = ExpClover * diag_mass;
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
CloverInv = ExpClover * (1.0/diag_mass);
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
//////////////////////////////////
// Compact Standard Clover
//////////////////////////////////
template<class Impl>
class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
Clover += diag_mass;
}
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
// TODO: implement Cmunu for better performances with compact layout, but don't do it
// here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Compact Exp Clover
//////////////////////////////////
template<class Impl>
class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
Clover *= (1.0/diag_mass);
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
// qN = cN
// qn = cn + qn+1 X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
// prepare inverse
CloverInv = (-1.0)*Clover;
Clover = ExpClover * diag_mass;
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
CloverInv = ExpClover * (1.0/diag_mass);
}
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
if (fixedBoundaries)
{
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
else
{
CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,241 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
Copyright (C) 2020 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
//
// Modifications done here:
//
// Original: clover term = 12x12 matrix per site
//
// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
//
// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
//
// Words per site and improvement compared to original (combined with the input and output spinors):
//
// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
// - Minimal: 2*12 + 36 = 60 words -> 2.80 x less
// - Here: 2*12 + 42 = 66 words -> 2.55 x less
//
// These improvements directly translate to wall-clock time
//
// Data layout:
//
// - diagonal and triangle part as separate lattice fields,
// this was faster than as 1 combined field on all tested machines
// - diagonal: as expected
// - triangle: store upper right triangle in row major order
// - graphical:
// 0 1 2 3 4
// 5 6 7 8
// 9 10 11 = upper right triangle indices
// 12 13
// 14
// 0
// 1
// 2
// 3 = diagonal indices
// 4
// 5
// 0
// 1 5
// 2 6 9 = lower left triangle indices
// 3 7 10 12
// 4 8 11 13 14
//
// Impact on total memory consumption:
// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
// - Here: (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts = 24 complex words per site
// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site
// = 84 complex words per site
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson);
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@@ -115,9 +116,9 @@ typedef WilsonFermion<WilsonImplR> WilsonFermionR;
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
@@ -137,62 +138,93 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
// Clover fermions
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
// Ls vectorised
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
@@ -235,49 +267,49 @@ typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;
typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;
//typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
//typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
//typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
@@ -291,12 +323,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
#ifndef GRID_CUDA
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
#endif
NAMESPACE_END(Grid);
////////////////////

View File

@@ -153,8 +153,8 @@ public:
typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams; \
typedef typename Impl::StencilImpl::View_type StencilView; \
typedef typename ViewMap<FermionField>::Type FermionFieldView; \
typedef typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
typedef const typename ViewMap<FermionField>::Type FermionFieldView; \
typedef const typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
#define INHERIT_IMPL_TYPES(Base) \
INHERIT_GIMPL_TYPES(Base) \
@@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
/////////////////////////////////////////////////////////////////////////////
// Single flavour one component spinors with colour index. 5d vec
/////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
NAMESPACE_CHECK(ImplStaggered5dVec);
// Deprecate Vec5d
//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
//NAMESPACE_CHECK(ImplStaggered5dVec);

View File

@@ -97,42 +97,30 @@ public:
Coordinate icoor;
#ifdef GRID_SIMT
_Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd;
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
//Decide whether we do a G-parity flavor twist
//Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
//It also assumes (but does not check) that abs(distance) == 1
int permute_lane = (sl==1)
|| ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0));
if ( permute_lane ) {
tmp(0) = chi(1);
tmp(1) = chi(0);
} else {
tmp(0) = chi(0);
tmp(1) = chi(1);
}
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
//Apply the links
int f_upper = permute_lane ? 1 : 0;
int f_lower = !f_upper;
mult(&phi(0),&UU0,&tmp(0));
mult(&phi(1),&UU1,&tmp(1));
} else {
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
mult(&phi(0),&UU0,&chi(0));
mult(&phi(1),&UU1,&chi(1));
}
mult(&phi(0),&UU0,&chi(f_upper));
mult(&phi(1),&UU1,&chi(f_lower));
#else
typedef _Spinor vobj;
@@ -339,8 +327,8 @@ typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> Gparit
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD; // Double
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
//typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
//typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
//typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
NAMESPACE_END(Grid);

View File

@@ -85,7 +85,7 @@ class MADWF
maxiter =_maxiter;
};
void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
void operator() (const FermionFieldo &src,FermionFieldo &sol5)
{
std::cout << GridLogMessage<< " ************************************************" << std::endl;
std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl;
@@ -114,8 +114,16 @@ class MADWF
///////////////////////////////////////
//Import source, include Dminus factors
///////////////////////////////////////
Mato.ImportPhysicalFermionSource(src4,b);
std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
GridBase *src_grid = src.Grid();
assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
if ( src_grid == Mato.GaugeGrid() ) {
Mato.ImportPhysicalFermionSource(src,b);
} else {
b=src;
}
std::cout << GridLogMessage << " src " <<norm2(src)<<std::endl;
std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl;
defect = b;

View File

@@ -72,19 +72,23 @@ public:
StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
static accelerator_inline void multLink(SiteSpinor &phi,
template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U,
const SiteSpinor &chi,
const _Spinor &chi,
int mu)
{
mult(&phi(), &U(mu), &chi());
auto UU = coalescedRead(U(mu));
mult(&phi(), &UU, &chi());
}
static accelerator_inline void multLinkAdd(SiteSpinor &phi,
template<class _Spinor>
static accelerator_inline void multLinkAdd(_Spinor &phi,
const SiteDoubledGaugeField &U,
const SiteSpinor &chi,
const _Spinor &chi,
int mu)
{
mac(&phi(), &U(mu), &chi());
auto UU = coalescedRead(U(mu));
mac(&phi(), &UU, &chi());
}
template <class ref>

View File

@@ -4,10 +4,11 @@
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
Copyright (C) 2017
Copyright (C) 2017 - 2022
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: David Preti <>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -29,7 +30,9 @@
#pragma once
#include <Grid/Grid.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
@@ -49,19 +52,16 @@ NAMESPACE_BEGIN(Grid);
// csw_r = csw_t to recover the isotropic version
//////////////////////////////////////////////////////////////////
template <class Impl>
class WilsonCloverFermion : public WilsonFermion<Impl>
template<class Impl, class CloverHelpers>
class WilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>
{
public:
// Types definitions
INHERIT_IMPL_TYPES(Impl);
template <typename vtype>
using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteCloverType;
typedef Lattice<SiteCloverType> CloverFieldType;
INHERIT_CLOVER_TYPES(Impl);
public:
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
virtual int ConstEE(void) { return 0; };
virtual void Instantiatable(void){};
@@ -72,42 +72,7 @@ public:
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
Fgrid,
Hgrid,
_mass, impl_p, clover_anisotropy),
CloverTerm(&Fgrid),
CloverTermInv(&Fgrid),
CloverTermEven(&Hgrid),
CloverTermOdd(&Hgrid),
CloverTermInvEven(&Hgrid),
CloverTermInvOdd(&Hgrid),
CloverTermDagEven(&Hgrid),
CloverTermDagOdd(&Hgrid),
CloverTermInvDagEven(&Hgrid),
CloverTermInvDagOdd(&Hgrid)
{
assert(Nd == 4); // require 4 dimensions
if (clover_anisotropy.isAnisotropic)
{
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
}
else
{
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if (csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if (csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
const ImplParams &impl_p = ImplParams());
virtual void M(const FermionField &in, FermionField &out);
virtual void Mdag(const FermionField &in, FermionField &out);
@@ -124,250 +89,21 @@ public:
void ImportGauge(const GaugeField &_Umu);
// Derivative parts unpreconditioned pseudofermions
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag);
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda.Grid(), U[0].Grid());
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
private:
public:
// here fixing the 4 dimensions, make it more general?
RealD csw_r; // Clover coefficient - spatial
RealD csw_t; // Clover coefficient - temporal
RealD diag_mass; // Mass term
CloverFieldType CloverTerm, CloverTermInv; // Clover term
CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO
CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverXZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()();
T_v[i]()(2, 3) = -F_v[i]()();
T_v[i]()(3, 2) = F_v[i]()();
});
return T;
}
CloverFieldType fillCloverXY(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()());
T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
T_v[i]()(3, 3) = timesI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverXT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()());
T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverYT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()());
T_v[i]()(2, 3) = (F_v[i]()());
T_v[i]()(3, 2) = -(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverZT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
T_v[i]()(3, 3) = timesI(F_v[i]()());
});
return T;
}
CloverField CloverTerm, CloverTermInv; // Clover term
CloverField CloverTermEven, CloverTermOdd; // Clover term EO
CloverField CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverField CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverField CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,763 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
Copyright (C) 2021 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
// Helper routines that implement common clover functionality
NAMESPACE_BEGIN(Grid);
template<class Impl> class WilsonCloverHelpers {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda.Grid(), U[0].Grid());
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
static CloverField fillCloverYZ(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverXZ(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
});
return T;
}
static CloverField fillCloverXY(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverXT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverYT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
});
return T;
}
static CloverField fillCloverZT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
});
return T;
}
template<class _Spinor>
static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
auto CC = coalescedRead(C);
mult(&phi, &CC, &chi);
}
template<class _SpinorField>
inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
const int Nsimd = SiteSpinor::Nsimd();
autoView(out_v, out, AcceleratorWrite);
autoView(phi_v, phi, AcceleratorRead);
autoView(C_v, C, AcceleratorRead);
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multClover(tmp,C_v[sss],phi_v(sss));
coalescedWrite(out_v[sss],tmp);
});
}
};
////////////////////////////////////////////////////////
template<class Impl> class CompactWilsonCloverHelpers {
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
#if 0
static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
assert(i != j);
if(i < j) {
return triangle()(block)(triangle_index(i, j));
} else { // i > j
return conjugate(triangle()(block)(triangle_index(i, j)));
}
}
#else
template<typename vobj>
static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
assert(i != j);
if(i < j) {
return triangle()(block)(triangle_index(i, j));
} else { // i > j
return conjugate(triangle()(block)(triangle_index(i, j)));
}
}
#endif
static accelerator_inline int triangle_index(int i, int j) {
if(i == j)
return 0;
else if(i < j)
return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
else // i > j
return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
}
static void MooeeKernel_gpu(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
autoView(diagonal_v, diagonal, AcceleratorRead);
autoView(triangle_v, triangle, AcceleratorRead);
autoView(in_v, in, AcceleratorRead);
autoView(out_v, out, AcceleratorWrite);
typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
const uint64_t NN = Nsite * Ls;
accelerator_for(ss, NN, Simd::Nsimd(), {
int sF = ss;
int sU = ss/Ls;
CalcSpinor res;
CalcSpinor in_t = in_v(sF);
auto diagonal_t = diagonal_v(sU);
auto triangle_t = triangle_v(sU);
for(int block=0; block<Nhs; block++) {
int s_start = block*Nhs;
for(int i=0; i<Nred; i++) {
int si = s_start + i/Nc, ci = i%Nc;
res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
for(int j=0; j<Nred; j++) {
if (j == i) continue;
int sj = s_start + j/Nc, cj = j%Nc;
res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
};
};
};
coalescedWrite(out_v[sF], res);
});
}
static void MooeeKernel_cpu(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
autoView(diagonal_v, diagonal, CpuRead);
autoView(triangle_v, triangle, CpuRead);
autoView(in_v, in, CpuRead);
autoView(out_v, out, CpuWrite);
typedef SiteSpinor CalcSpinor;
#if defined(A64FX) || defined(A64FXFIXEDSIZE)
#define PREFETCH_CLOVER(BASE) { \
uint64_t base; \
int pf_dist_L1 = 1; \
int pf_dist_L2 = -5; /* -> penalty -> disable */ \
\
if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
} \
\
if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
} \
}
// TODO: Implement/generalize this for other architectures
// I played around a bit on KNL (see below) but didn't bring anything
// #elif defined(AVX512)
// #define PREFETCH_CLOVER(BASE) { \
// uint64_t base; \
// int pf_dist_L1 = 1; \
// int pf_dist_L2 = +4; \
// \
// if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
// base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
// _mm_prefetch((const char*)(base + 0), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 64), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 128), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 192), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 256), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 320), _MM_HINT_T0); \
// } \
// \
// if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
// base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
// _mm_prefetch((const char*)(base + 0), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 64), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 128), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 192), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 256), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 320), _MM_HINT_T1); \
// } \
// }
#else
#define PREFETCH_CLOVER(BASE)
#endif
const uint64_t NN = Nsite * Ls;
thread_for(ss, NN, {
int sF = ss;
int sU = ss/Ls;
CalcSpinor res;
CalcSpinor in_t = in_v[sF];
auto diag_t = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
auto triangle_t = triangle_v[sU];
// upper half
PREFETCH_CLOVER(0);
auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
auto in_cc_1_0 = conjugate(in_t()(1)(0));
auto in_cc_1_1 = conjugate(in_t()(1)(1));
res()(0)(0) = diag_t()(0)( 0) * in_t()(0)(0)
+ triangle_t()(0)( 0) * in_t()(0)(1)
+ triangle_t()(0)( 1) * in_t()(0)(2)
+ triangle_t()(0)( 2) * in_t()(1)(0)
+ triangle_t()(0)( 3) * in_t()(1)(1)
+ triangle_t()(0)( 4) * in_t()(1)(2);
res()(0)(1) = triangle_t()(0)( 0) * in_cc_0_0;
res()(0)(1) = diag_t()(0)( 1) * in_t()(0)(1)
+ triangle_t()(0)( 5) * in_t()(0)(2)
+ triangle_t()(0)( 6) * in_t()(1)(0)
+ triangle_t()(0)( 7) * in_t()(1)(1)
+ triangle_t()(0)( 8) * in_t()(1)(2)
+ conjugate( res()(0)( 1));
res()(0)(2) = triangle_t()(0)( 1) * in_cc_0_0
+ triangle_t()(0)( 5) * in_cc_0_1;
res()(0)(2) = diag_t()(0)( 2) * in_t()(0)(2)
+ triangle_t()(0)( 9) * in_t()(1)(0)
+ triangle_t()(0)(10) * in_t()(1)(1)
+ triangle_t()(0)(11) * in_t()(1)(2)
+ conjugate( res()(0)( 2));
res()(1)(0) = triangle_t()(0)( 2) * in_cc_0_0
+ triangle_t()(0)( 6) * in_cc_0_1
+ triangle_t()(0)( 9) * in_cc_0_2;
res()(1)(0) = diag_t()(0)( 3) * in_t()(1)(0)
+ triangle_t()(0)(12) * in_t()(1)(1)
+ triangle_t()(0)(13) * in_t()(1)(2)
+ conjugate( res()(1)( 0));
res()(1)(1) = triangle_t()(0)( 3) * in_cc_0_0
+ triangle_t()(0)( 7) * in_cc_0_1
+ triangle_t()(0)(10) * in_cc_0_2
+ triangle_t()(0)(12) * in_cc_1_0;
res()(1)(1) = diag_t()(0)( 4) * in_t()(1)(1)
+ triangle_t()(0)(14) * in_t()(1)(2)
+ conjugate( res()(1)( 1));
res()(1)(2) = triangle_t()(0)( 4) * in_cc_0_0
+ triangle_t()(0)( 8) * in_cc_0_1
+ triangle_t()(0)(11) * in_cc_0_2
+ triangle_t()(0)(13) * in_cc_1_0
+ triangle_t()(0)(14) * in_cc_1_1;
res()(1)(2) = diag_t()(0)( 5) * in_t()(1)(2)
+ conjugate( res()(1)( 2));
vstream(out_v[sF]()(0)(0), res()(0)(0));
vstream(out_v[sF]()(0)(1), res()(0)(1));
vstream(out_v[sF]()(0)(2), res()(0)(2));
vstream(out_v[sF]()(1)(0), res()(1)(0));
vstream(out_v[sF]()(1)(1), res()(1)(1));
vstream(out_v[sF]()(1)(2), res()(1)(2));
// lower half
PREFETCH_CLOVER(1);
auto in_cc_2_0 = conjugate(in_t()(2)(0));
auto in_cc_2_1 = conjugate(in_t()(2)(1));
auto in_cc_2_2 = conjugate(in_t()(2)(2));
auto in_cc_3_0 = conjugate(in_t()(3)(0));
auto in_cc_3_1 = conjugate(in_t()(3)(1));
res()(2)(0) = diag_t()(1)( 0) * in_t()(2)(0)
+ triangle_t()(1)( 0) * in_t()(2)(1)
+ triangle_t()(1)( 1) * in_t()(2)(2)
+ triangle_t()(1)( 2) * in_t()(3)(0)
+ triangle_t()(1)( 3) * in_t()(3)(1)
+ triangle_t()(1)( 4) * in_t()(3)(2);
res()(2)(1) = triangle_t()(1)( 0) * in_cc_2_0;
res()(2)(1) = diag_t()(1)( 1) * in_t()(2)(1)
+ triangle_t()(1)( 5) * in_t()(2)(2)
+ triangle_t()(1)( 6) * in_t()(3)(0)
+ triangle_t()(1)( 7) * in_t()(3)(1)
+ triangle_t()(1)( 8) * in_t()(3)(2)
+ conjugate( res()(2)( 1));
res()(2)(2) = triangle_t()(1)( 1) * in_cc_2_0
+ triangle_t()(1)( 5) * in_cc_2_1;
res()(2)(2) = diag_t()(1)( 2) * in_t()(2)(2)
+ triangle_t()(1)( 9) * in_t()(3)(0)
+ triangle_t()(1)(10) * in_t()(3)(1)
+ triangle_t()(1)(11) * in_t()(3)(2)
+ conjugate( res()(2)( 2));
res()(3)(0) = triangle_t()(1)( 2) * in_cc_2_0
+ triangle_t()(1)( 6) * in_cc_2_1
+ triangle_t()(1)( 9) * in_cc_2_2;
res()(3)(0) = diag_t()(1)( 3) * in_t()(3)(0)
+ triangle_t()(1)(12) * in_t()(3)(1)
+ triangle_t()(1)(13) * in_t()(3)(2)
+ conjugate( res()(3)( 0));
res()(3)(1) = triangle_t()(1)( 3) * in_cc_2_0
+ triangle_t()(1)( 7) * in_cc_2_1
+ triangle_t()(1)(10) * in_cc_2_2
+ triangle_t()(1)(12) * in_cc_3_0;
res()(3)(1) = diag_t()(1)( 4) * in_t()(3)(1)
+ triangle_t()(1)(14) * in_t()(3)(2)
+ conjugate( res()(3)( 1));
res()(3)(2) = triangle_t()(1)( 4) * in_cc_2_0
+ triangle_t()(1)( 8) * in_cc_2_1
+ triangle_t()(1)(11) * in_cc_2_2
+ triangle_t()(1)(13) * in_cc_3_0
+ triangle_t()(1)(14) * in_cc_3_1;
res()(3)(2) = diag_t()(1)( 5) * in_t()(3)(2)
+ conjugate( res()(3)( 2));
vstream(out_v[sF]()(2)(0), res()(2)(0));
vstream(out_v[sF]()(2)(1), res()(2)(1));
vstream(out_v[sF]()(2)(2), res()(2)(2));
vstream(out_v[sF]()(3)(0), res()(3)(0));
vstream(out_v[sF]()(3)(1), res()(3)(1));
vstream(out_v[sF]()(3)(2), res()(3)(2));
});
}
static void MooeeKernel(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
#else
MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
#endif
}
static void Invert(const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv) {
conformable(diagonal, diagonalInv);
conformable(triangle, triangleInv);
conformable(diagonal, triangle);
diagonalInv.Checkerboard() = diagonal.Checkerboard();
triangleInv.Checkerboard() = triangle.Checkerboard();
GridBase* grid = diagonal.Grid();
long lsites = grid->lSites();
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
autoView(diagonal_v, diagonal, CpuRead);
autoView(triangle_v, triangle, CpuRead);
autoView(diagonalInv_v, diagonalInv, CpuWrite);
autoView(triangleInv_v, triangleInv, CpuWrite);
thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_inv_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_inv_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
// TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
for (long s_row=0;s_row<Ns;s_row++) {
for (long s_col=0;s_col<Ns;s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for (long c_row=0;c_row<Nc;c_row++) {
for (long c_col=0;c_col<Nc;c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
else
clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
}
}
}
}
clover_inv_eigen = clover_eigen.inverse();
for (long s_row=0;s_row<Ns;s_row++) {
for (long s_col=0;s_col<Ns;s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for (long c_row=0;c_row<Nc;c_row++) {
for (long c_col=0;c_col<Nc;c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
else if(i < j)
triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
else
continue;
}
}
}
}
pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
});
}
static void ConvertLayout(const CloverField& full,
CloverDiagonalField& diagonal,
CloverTriangleField& triangle) {
conformable(full, diagonal);
conformable(full, triangle);
diagonal.Checkerboard() = full.Checkerboard();
triangle.Checkerboard() = full.Checkerboard();
autoView(full_v, full, AcceleratorRead);
autoView(diagonal_v, diagonal, AcceleratorWrite);
autoView(triangle_v, triangle, AcceleratorWrite);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, full.Grid()->oSites(), 1, {
for(int s_row = 0; s_row < Ns; s_row++) {
for(int s_col = 0; s_col < Ns; s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for(int c_row = 0; c_row < Nc; c_row++) {
for(int c_col = 0; c_col < Nc; c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
else if(i < j)
triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
else
continue;
}
}
}
}
});
}
static void ConvertLayout(const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverField& full) {
conformable(full, diagonal);
conformable(full, triangle);
full.Checkerboard() = diagonal.Checkerboard();
full = Zero();
autoView(diagonal_v, diagonal, AcceleratorRead);
autoView(triangle_v, triangle, AcceleratorRead);
autoView(full_v, full, AcceleratorWrite);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, full.Grid()->oSites(), 1, {
for(int s_row = 0; s_row < Ns; s_row++) {
for(int s_col = 0; s_col < Ns; s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for(int c_row = 0; c_row < Nc; c_row++) {
for(int c_col = 0; c_col < Nc; c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
else
full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
}
}
}
}
});
}
static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
// Checks/grid
double t0 = usecond();
conformable(diagonal, triangle);
GridBase* grid = diagonal.Grid();
// Determine the boundary coordinates/sites
double t1 = usecond();
int t_dir = Nd - 1;
Lattice<iScalar<vInteger>> t_coor(grid);
LatticeCoordinate(t_coor, t_dir);
int T = grid->GlobalDimensions()[t_dir];
// Set off-diagonal parts at boundary to zero -- OK
double t2 = usecond();
CloverTriangleField zeroTriangle(grid);
zeroTriangle.Checkerboard() = triangle.Checkerboard();
zeroTriangle = Zero();
triangle = where(t_coor == 0, zeroTriangle, triangle);
triangle = where(t_coor == T-1, zeroTriangle, triangle);
// Set diagonal to unity (scaled correctly) -- OK
double t3 = usecond();
CloverDiagonalField tmp(grid);
tmp.Checkerboard() = diagonal.Checkerboard();
tmp = -1.0 * csw_t + diag_mass;
diagonal = where(t_coor == 0, tmp, diagonal);
diagonal = where(t_coor == T-1, tmp, diagonal);
// Correct values next to boundary
double t4 = usecond();
if(cF != 1.0) {
tmp = cF - 1.0;
tmp += diagonal;
diagonal = where(t_coor == 1, tmp, diagonal);
diagonal = where(t_coor == T-2, tmp, diagonal);
}
// Report timings
double t5 = usecond();
#if 0
std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
<< " checks = " << (t1 - t0) / 1e6
<< ", coordinate = " << (t2 - t1) / 1e6
<< ", off-diag zero = " << (t3 - t2) / 1e6
<< ", diagonal unity = " << (t4 - t3) / 1e6
<< ", near-boundary = " << (t5 - t4) / 1e6
<< ", total = " << (t5 - t0) / 1e6
<< std::endl;
#endif
}
template<class Field, class Mask>
static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
conformable(f, m);
auto grid = f.Grid();
const uint32_t Nsite = grid->oSites();
const uint32_t Nsimd = grid->Nsimd();
autoView(f_v, f, AcceleratorWrite);
autoView(m_v, m, AcceleratorRead);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, Nsite, Nsimd, {
coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
});
}
template<class MaskField>
static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
assert(odd.Grid()->_isCheckerBoarded && odd.Checkerboard() == Odd);
assert(!full.Grid()->_isCheckerBoarded);
GridBase* grid = full.Grid();
int t_dir = Nd-1;
Lattice<iScalar<vInteger>> t_coor(grid);
LatticeCoordinate(t_coor, t_dir);
int T = grid->GlobalDimensions()[t_dir];
MaskField zeroMask(grid); zeroMask = Zero();
full = 1.0;
full = where(t_coor == 0, zeroMask, full);
full = where(t_coor == T-1, zeroMask, full);
pickCheckerboard(Even, even, full);
pickCheckerboard(Odd, odd, full);
}
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,90 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
Copyright (C) 2021 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Impl>
class WilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteClover;
typedef Lattice<SiteClover> CloverField;
};
template<class Impl>
class CompactWilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
static constexpr int Nred = Nc * Nhs; // 6
static constexpr int Nblock = Nhs; // 2
static constexpr int Ndiagonal = Nred; // 6
static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
typedef iSinglet<Simd> SiteMask;
typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
typedef Lattice<SiteCloverTriangle> CloverTriangleField;
typedef Lattice<SiteMask> MaskField;
};
#define INHERIT_CLOVER_TYPES(Impl) \
typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
#define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal SiteCloverDiagonal; \
typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle SiteCloverTriangle; \
typedef typename CompactWilsonCloverTypes<Impl>::SiteMask SiteMask; \
typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
typedef typename CompactWilsonCloverTypes<Impl>::MaskField MaskField; \
/* ugly duplication but needed inside functionality classes */ \
template<typename vtype> using iImplCloverDiagonal = \
iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
template<typename vtype> using iImplCloverTriangle = \
iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
#define INHERIT_COMPACT_CLOVER_SIZES(Impl) \
static constexpr int Nred = CompactWilsonCloverTypes<Impl>::Nred; \
static constexpr int Nblock = CompactWilsonCloverTypes<Impl>::Nblock; \
static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
NAMESPACE_END(Grid);

View File

@@ -61,18 +61,19 @@ public:
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) {
accelerator_inline int CommDatumSize(void) const {
return sizeof(SiteHalfCommSpinor);
}
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
template<class _SiteHalfSpinor, class _SiteSpinor>
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
_SiteHalfSpinor tmp;
projector::Proj(tmp,in,mu,dag);
vstream(buf[o],tmp);
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
typedef decltype(coalescedRead(buf)) sobj;
sobj sp;
auto sin = coalescedRead(in);
projector::Proj(sp,sin,mu,dag);
coalescedWrite(buf,sp);
}
/*****************************************************/
@@ -81,19 +82,24 @@ public:
accelerator_inline void Exchange(SiteHalfSpinor *mp,
const SiteHalfSpinor * __restrict__ vp0,
const SiteHalfSpinor * __restrict__ vp1,
Integer type,Integer o){
Integer type,Integer o) const {
#ifdef GRID_SIMT
exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
#else
SiteHalfSpinor tmp1;
SiteHalfSpinor tmp2;
exchange(tmp1,tmp2,vp0[o],vp1[o],type);
vstream(mp[2*o ],tmp1);
vstream(mp[2*o+1],tmp2);
#endif
}
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
SiteHalfSpinor * __restrict__ in, Integer o) {
SiteHalfSpinor * __restrict__ in, Integer o) const {
assert(0);
}
@@ -103,8 +109,30 @@ public:
accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
SiteHalfSpinor * __restrict__ out1,
const SiteSpinor * __restrict__ in,
Integer j,Integer k, Integer m,Integer type)
Integer j,Integer k, Integer m,Integer type) const
{
#ifdef GRID_SIMT
typedef SiteSpinor vobj;
typedef SiteHalfSpinor hvobj;
typedef decltype(coalescedRead(*in)) sobj;
typedef decltype(coalescedRead(*out0)) hsobj;
constexpr unsigned int Nsimd = vobj::Nsimd();
unsigned int mask = Nsimd >> (type + 1);
int lane = acceleratorSIMTlane(Nsimd);
int j0 = lane &(~mask); // inner coor zero
int j1 = lane |(mask) ; // inner coor one
const vobj *vp0 = &in[k]; // out0[j] = merge low bit of type from in[k] and in[m]
const vobj *vp1 = &in[m]; // out1[j] = merge hi bit of type from in[k] and in[m]
const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
auto sb = coalescedRead(*vp,j1); // lane to read for out 1
hsobj psa, psb;
projector::Proj(psa,sa,mu,dag); // spin project the result0
projector::Proj(psb,sb,mu,dag); // spin project the result1
coalescedWrite(out0[j],psa);
coalescedWrite(out1[j],psb);
#else
SiteHalfSpinor temp1, temp2;
SiteHalfSpinor temp3, temp4;
projector::Proj(temp1,in[k],mu,dag);
@@ -112,15 +140,17 @@ public:
exchange(temp3,temp4,temp1,temp2,type);
vstream(out0[j],temp3);
vstream(out1[j],temp4);
#endif
}
/*****************************************************/
/* Pass the info to the stencil */
/*****************************************************/
accelerator_inline bool DecompressionStep(void) { return false; }
accelerator_inline bool DecompressionStep(void) const { return false; }
};
#if 0
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
@@ -142,20 +172,30 @@ public:
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) {
accelerator_inline int CommDatumSize(void) const {
return sizeof(SiteHalfCommSpinor);
}
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
template<class _SiteHalfSpinor, class _SiteSpinor>
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
_SiteHalfSpinor hsp;
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
SiteHalfSpinor hsp;
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
projector::Proj(hsp,in,mu,dag);
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
}
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
#ifdef GRID_SIMT
typedef decltype(coalescedRead(buf)) sobj;
sobj sp;
auto sin = coalescedRead(in);
projector::Proj(sp,sin,mu,dag);
coalescedWrite(buf,sp);
#else
projector::Proj(buf,in,mu,dag);
#endif
}
/*****************************************************/
/* Exchange includes precision change if mpi data is not same */
@@ -163,7 +203,7 @@ public:
accelerator_inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1,
Integer type,Integer o){
Integer type,Integer o) const {
SiteHalfSpinor vt0,vt1;
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
@@ -175,7 +215,7 @@ public:
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
}
@@ -186,7 +226,7 @@ public:
accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1,
const SiteSpinor *in,
Integer j,Integer k, Integer m,Integer type){
Integer j,Integer k, Integer m,Integer type) const {
SiteHalfSpinor temp1, temp2,temp3,temp4;
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
@@ -200,9 +240,10 @@ public:
/*****************************************************/
/* Pass the info to the stencil */
/*****************************************************/
accelerator_inline bool DecompressionStep(void) { return true; }
accelerator_inline bool DecompressionStep(void) const { return true; }
};
#endif
#define DECLARE_PROJ(Projector,Compressor,spProj) \
class Projector { \
@@ -253,33 +294,8 @@ public:
typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector;
double timer0;
double timer1;
double timer2;
double timer3;
double timer4;
double timer5;
double timer6;
uint64_t callsi;
void ZeroCountersi(void)
{
timer0=0;
timer1=0;
timer2=0;
timer3=0;
timer4=0;
timer5=0;
timer6=0;
callsi=0;
}
void Reporti(int calls)
{
if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate) " <<timer1/calls <<std::endl;
if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge ) " <<timer2/calls <<std::endl;
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
}
void ZeroCountersi(void) { }
void Reporti(int calls) { }
std::vector<int> surface_list;
@@ -321,26 +337,18 @@ public:
{
std::vector<std::vector<CommsRequest_t> > reqs;
this->HaloExchangeOptGather(source,compress);
double t1=usecond();
// Asynchronous MPI calls multidirectional, Isend etc...
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
this->Communicate();
double t2=usecond(); timer1 += t2-t1;
this->CommsMerge(compress);
double t3=usecond(); timer2 += t3-t2;
this->CommsMergeSHM(compress);
double t4=usecond(); timer3 += t4-t3;
}
template <class compressor>
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
{
this->Prepare();
double t0=usecond();
this->HaloGatherOpt(source,compress);
double t1=usecond();
timer0 += t1-t0;
callsi++;
}
template <class compressor>
@@ -352,12 +360,9 @@ public:
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
this->mpi3synctime_g-=usecond();
this->_grid->StencilBarrier();
this->mpi3synctime_g+=usecond();
assert(source.Grid()==this->_grid);
this->halogtime-=usecond();
this->u_comm_offset=0;
@@ -393,7 +398,6 @@ public:
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
this->halogtime+=usecond();
accelerator_barrier();
}

View File

@@ -72,7 +72,7 @@ public:
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
typedef typename StencilImpl::View_type StencilView;
typedef const typename StencilImpl::View_type StencilView;
ImplParams Params;
@@ -106,11 +106,15 @@ public:
const _SpinorField & phi,
int mu)
{
const int Nsimd = SiteHalfSpinor::Nsimd();
autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead);
accelerator_for(sss,out.Grid()->oSites(),1,{
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
coalescedWrite(out_v[sss],tmp);
});
}
@@ -180,18 +184,22 @@ public:
mat = TraceIndex<SpinIndex>(P);
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
{
for (int mu = 0; mu < Nd; mu++)
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
{
#undef USE_OLD_INSERT_FORCE
int Ls=Btilde.Grid()->_fdimensions[0];
autoView( mat_v , mat, AcceleratorWrite);
#ifdef USE_OLD_INSERT_FORCE
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
{
const int Nsimd = SiteSpinor::Nsimd();
autoView( tmp_v , tmp, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
@@ -204,6 +212,29 @@ public:
});
}
PokeIndex<LorentzIndex>(mat,tmp,mu);
#else
{
const int Nsimd = SiteSpinor::Nsimd();
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF]()(spn) );
auto op = outerProduct(bb,aa);
sum = sum + op;
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
#endif
}
};
@@ -212,17 +243,17 @@ typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > WilsonImplR
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF; // Double
//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL; // Real.. whichever prec
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH; // Float
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float

View File

@@ -49,9 +49,17 @@ public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public:
#ifdef GRID_SYCL
#define SYCL_HACK
#endif
#ifdef SYCL_HACK
static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
#endif
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;

View File

@@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
FiveDimRedBlackGrid,
FourDimGrid,
FourDimRedBlackGrid,_M5,p),
mass(_mass)
mass_plus(_mass), mass_minus(_mass)
{
}
@@ -209,8 +209,8 @@ void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
Vector<Coeff_t> diag (Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass;
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
M5D(psi,chi,chi,lower,diag,upper);
}
template<class Impl>
@@ -220,8 +220,8 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
Vector<Coeff_t> diag = bs;
Vector<Coeff_t> upper= cs;
Vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,Din,lower,diag,upper);
}
// FIXME Redunant with the above routine; check this and eliminate
@@ -235,8 +235,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &
upper[i]=-ceo[i];
lower[i]=-ceo[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@@ -250,8 +250,8 @@ void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &
upper[i]=-cee[i];
lower[i]=-cee[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@@ -266,9 +266,9 @@ void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &
// Assemble the 5d matrix
if ( s==0 ) {
upper[s] = -cee[s+1] ;
lower[s] = mass*cee[Ls-1];
lower[s] = mass_minus*cee[Ls-1];
} else if ( s==(Ls-1)) {
upper[s] = mass*cee[0];
upper[s] = mass_plus*cee[0];
lower[s] = -cee[s-1];
} else {
upper[s]=-cee[s+1];
@@ -291,8 +291,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0);
Vector<Coeff_t> lower(Ls,-1.0);
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_plus*upper[Ls-1];
lower[0] =-mass_minus*lower[0];
M5Ddag(psi,chi,chi,lower,diag,upper);
}
@@ -307,9 +307,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
for (int s=0;s<Ls;s++){
if ( s== 0 ) {
upper[s] = cs[s+1];
lower[s] =-mass*cs[Ls-1];
lower[s] =-mass_minus*cs[Ls-1];
} else if ( s==(Ls-1) ) {
upper[s] =-mass*cs[0];
upper[s] =-mass_plus*cs[0];
lower[s] = cs[s-1];
} else {
upper[s] = cs[s+1];
@@ -552,7 +552,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass*cee[Ls-1]/bee[0];
leem[i]=mass_minus*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) {
assert(bee[j+1]!=Coeff_t(0.0));
leem[i]*= aee[j]/bee[j+1];
@@ -560,7 +560,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
ueem[i]=mass;
ueem[i]=mass_plus;
for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
ueem[i]*= aee[0]/bee[0];
@@ -573,7 +573,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
}
{
Coeff_t delta_d=mass*cee[Ls-1];
Coeff_t delta_d=mass_minus*cee[Ls-1];
for(int j=0;j<Ls-1;j++) {
assert(bee[j] != Coeff_t(0.0));
delta_d *= cee[j]/bee[j];
@@ -642,7 +642,11 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
@@ -777,6 +781,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
assert(mu>=0);
assert(mu<Nd);
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if 0
int tshift = (mu == Nd-1) ? 1 : 0;
@@ -826,8 +832,9 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
////////////////////////////////////////////////
// GENERAL CAYLEY CASE
////////////////////////////////////////////////
@@ -880,17 +887,29 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
std::vector<RealD> G_s(Ls,1.0);
RealD sign = 1.0; // sign flip for vector/tadpole
if ( curr_type == Current::Axial ) {
for(int s=0;s<Ls/2;s++){
G_s[s] = -1.0;
}
}
else if ( curr_type == Current::Tadpole ) {
auto b=this->_b;
auto c=this->_c;
if ( b == 1 && c == 0 ) {
sign = -1.0;
}
else {
std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
assert(b==1 && c==0);
}
}
for(int s=0;s<Ls;s++){
int sp = (s+1)%Ls;
int sr = Ls-1-s;
int srp= (sr+1)%Ls;
// int sr = Ls-1-s;
// int srp= (sr+1)%Ls;
// Mobius parameters
auto b=this->bs[s];
@@ -907,7 +926,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = Cshift(tmp,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
tmp = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated
@@ -922,7 +941,13 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -G_s[s]*( Utmp + gmu*Utmp );
tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
// Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
InsertSlice(L_Q, q_out, s , 0);

View File

@@ -0,0 +1,377 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&Fgrid), Triangle(&Fgrid)
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
, DiagonalInv(&Fgrid), TriangleInv(&Fgrid)
, DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
, DiagonalInvOdd(&Hgrid), TriangleInvOdd(&Hgrid)
, Tmp(&Fgrid)
, BoundaryMask(&Fgrid)
, BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
if (clover_anisotropy.isAnisotropic)
csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
} else {
MooeeInternal(in, out, DiagonalEven, TriangleEven);
}
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
} else {
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
}
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
conformable(in, diagonal);
conformable(in, triangle);
CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
// TODO: For now this inversion is explictly done on the CPU
double t7 = usecond();
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);

View File

@@ -680,7 +680,8 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
gauge2 =(uint64_t)&UU[sU]( Z ); \
gauge3 =(uint64_t)&UU[sU]( T );
#undef STAG_VEC5D
#ifdef STAG_VEC5D
// This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
@@ -790,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
#endif
}
#endif
#define PERMUTE_DIR3 __asm__ ( \

View File

@@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
#define LOAD_CHI(b) \
#ifdef GRID_SIMT
#define LOAD_CHI(ptype,b) \
const SiteSpinor & ref (b[offset]); \
Chi_0=ref()()(0);\
Chi_1=ref()()(1);\
Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane); \
Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane); \
Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
#define LOAD_CHI_COMMS(b) \
const SiteSpinor & ref (b[offset]); \
Chi_0=coalescedRead(ref()()(0),lane); \
Chi_1=coalescedRead(ref()()(1),lane); \
Chi_2=coalescedRead(ref()()(2),lane);
#define PERMUTE_DIR(dir) ;
#else
#define LOAD_CHI(ptype,b) LOAD_CHI_COMMS(b)
#define LOAD_CHI_COMMS(b) \
const SiteSpinor & ref (b[offset]); \
Chi_0=ref()()(0); \
Chi_1=ref()()(1); \
Chi_2=ref()()(2);
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
#endif
// To splat or not to splat depends on the implementation
#define MULT(A,UChi) \
auto & ref(U[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
Impl::loadLinkElement(U_02,ref()(0,2)); \
Impl::loadLinkElement(U_12,ref()(1,2)); \
Impl::loadLinkElement(U_22,ref()(2,2)); \
U_00=coalescedRead(ref()(0,0),lane); \
U_10=coalescedRead(ref()(1,0),lane); \
U_20=coalescedRead(ref()(2,0),lane); \
U_01=coalescedRead(ref()(0,1),lane); \
U_11=coalescedRead(ref()(1,1),lane); \
U_21=coalescedRead(ref()(2,1),lane); \
U_02=coalescedRead(ref()(0,2),lane); \
U_12=coalescedRead(ref()(1,2),lane); \
U_22=coalescedRead(ref()(2,2),lane); \
UChi ## _0 = U_00*Chi_0; \
UChi ## _1 = U_10*Chi_0;\
UChi ## _2 = U_20*Chi_0;\
@@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);
#define MULT_ADD(U,A,UChi) \
auto & ref(U[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
Impl::loadLinkElement(U_02,ref()(0,2)); \
Impl::loadLinkElement(U_12,ref()(1,2)); \
Impl::loadLinkElement(U_22,ref()(2,2)); \
U_00=coalescedRead(ref()(0,0),lane); \
U_10=coalescedRead(ref()(1,0),lane); \
U_20=coalescedRead(ref()(2,0),lane); \
U_01=coalescedRead(ref()(0,1),lane); \
U_11=coalescedRead(ref()(1,1),lane); \
U_21=coalescedRead(ref()(2,1),lane); \
U_02=coalescedRead(ref()(0,2),lane); \
U_12=coalescedRead(ref()(1,2),lane); \
U_22=coalescedRead(ref()(2,2),lane); \
UChi ## _0 += U_00*Chi_0; \
UChi ## _1 += U_10*Chi_0;\
UChi ## _2 += U_20*Chi_0;\
@@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
UChi ## _2 += U_22*Chi_2;
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in); \
LOAD_CHI(Perm,in); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else { \
LOAD_CHI(buf); \
LOAD_CHI_COMMS(buf); \
}
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
@@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
}
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in); \
LOAD_CHI(Perm,in); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else if ( st.same_node[Dir] ) { \
LOAD_CHI(buf); \
LOAD_CHI_COMMS(buf); \
} \
if (local || st.same_node[Dir] ) { \
MULT_ADD(U,Dir,even); \
@@ -140,10 +158,32 @@ NAMESPACE_BEGIN(Grid);
local = SE->_is_local; \
if ((!local) && (!st.same_node[Dir]) ) { \
nmu++; \
{ LOAD_CHI(buf); } \
{ LOAD_CHI_COMMS(buf); } \
{ MULT_ADD(U,Dir,even); } \
}
#define HAND_DECLARATIONS(Simd) \
Simd even_0; \
Simd even_1; \
Simd even_2; \
Simd odd_0; \
Simd odd_1; \
Simd odd_2; \
\
Simd Chi_0; \
Simd Chi_1; \
Simd Chi_2; \
\
Simd U_00; \
Simd U_10; \
Simd U_20; \
Simd U_01; \
Simd U_11; \
Simd U_21; \
Simd U_02; \
Simd U_12; \
Simd U_22;
template <class Impl>
template <int Naik> accelerator_inline
@@ -155,28 +195,14 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
HAND_DECLARATIONS(Simt);
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
calcSiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
@@ -215,7 +241,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
vstream(out[sF],result);
coalescedWrite(out[sF],result);
}
}
@@ -230,28 +256,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
HAND_DECLARATIONS(Simt);
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
calcSiteSpinor result;
int offset, ptype, local, perm;
StencilEntry *SE;
@@ -261,8 +272,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
// int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
zeroit(even_0); zeroit(even_1); zeroit(even_2);
zeroit(odd_0); zeroit(odd_1); zeroit(odd_2);
skew = 0;
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
@@ -294,7 +305,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
vstream(out[sF],result);
coalescedWrite(out[sF],result);
}
}
@@ -309,28 +320,13 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
HAND_DECLARATIONS(Simt);
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
calcSiteSpinor result;
int offset, ptype, local;
StencilEntry *SE;
@@ -340,8 +336,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
// int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
zeroit(even_0); zeroit(even_1); zeroit(even_2);
zeroit(odd_0); zeroit(odd_1); zeroit(odd_2);
int nmu=0;
skew = 0;
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
@@ -374,7 +370,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
out[sF] = out[sF] + result;
coalescedWrite(out[sF] , out(sF)+ result);
}
}
}
@@ -397,6 +393,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI
#undef HAND_DECLARATIONS
NAMESPACE_END(Grid);

View File

@@ -35,39 +35,32 @@ NAMESPACE_BEGIN(Grid);
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
int perm= SE->_permute; \
chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
} else { \
chi_p = &in[SE->_offset]; \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
} else { \
chi_p = &buf[SE->_offset]; \
} \
multLink(Uchi, U[sU], *chi_p, Dir);
acceleratorSynchronise(); \
multLink(Uchi, U[sU], chi, Dir);
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
} \
int perm= SE->_permute; \
chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U[sU], *chi_p, Dir); \
multLink(Uchi, U[sU], chi, Dir); \
}
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
chi_p = &buf[SE->_offset]; \
multLink(Uchi, U[sU], *chi_p, Dir); \
chi = coalescedRead(buf[SE->_offset],lane); \
multLink(Uchi, U[sU], chi, Dir); \
}
template <class Impl>
@@ -84,12 +77,14 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag)
{
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
typedef decltype(coalescedRead(in[0])) calcSpinor;
calcSpinor chi;
calcSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// for(int s=0;s<LLs;s++){
//
@@ -118,7 +113,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
if ( dag ) {
Uchi = - Uchi;
}
vstream(out[sF], Uchi);
coalescedWrite(out[sF], Uchi,lane);
}
};
@@ -130,13 +125,16 @@ template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
calcSpinor chi;
calcSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew ;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s;
@@ -165,7 +163,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
if ( dag ) {
Uchi = - Uchi;
}
vstream(out[sF], Uchi);
coalescedWrite(out[sF], Uchi,lane);
}
};
@@ -178,14 +176,17 @@ template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p;
// SiteSpinor chi;
SiteSpinor Uchi;
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
calcSpinor chi;
calcSpinor Uchi;
StencilEntry *SE;
int ptype;
int nmu=0;
int skew ;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s;
@@ -212,10 +213,11 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( nmu ) {
auto _out = coalescedRead(out[sF],lane);
if ( dag ) {
out[sF] = out[sF] - Uchi;
coalescedWrite(out[sF], _out-Uchi,lane);
} else {
out[sF] = out[sF] + Uchi;
coalescedWrite(out[sF], _out+Uchi,lane);
}
}
}
@@ -261,6 +263,8 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
autoView( UUU_v , UUU, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
@@ -301,6 +305,8 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
autoView( UUU_v , U, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);

View File

@@ -2,12 +2,13 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
Copyright (C) 2017
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -33,9 +34,48 @@
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, CloverTerm(&Fgrid)
, CloverTermInv(&Fgrid)
, CloverTermEven(&Hgrid)
, CloverTermOdd(&Hgrid)
, CloverTermInvEven(&Hgrid)
, CloverTermInvOdd(&Hgrid)
, CloverTermDagEven(&Hgrid)
, CloverTermDagOdd(&Hgrid)
, CloverTermInvDagEven(&Hgrid)
, CloverTermInvDagOdd(&Hgrid) {
assert(Nd == 4); // require 4 dimensions
if(clover_anisotropy.isAnisotropic) {
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
} else {
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if(csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if(csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
// *NOT* EO
template <class Impl>
void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@@ -49,8 +89,8 @@ void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
out += temp;
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@@ -64,13 +104,16 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
out += temp;
}
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
{
double t0 = usecond();
WilsonFermion<Impl>::ImportGauge(_Umu);
double t1 = usecond();
GridBase *grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
double t2 = usecond();
// Compute the field strength terms mu>nu
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@@ -79,99 +122,73 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
double t3 = usecond();
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
CloverTerm = fillCloverYZ(Bx) * csw_r;
CloverTerm += fillCloverXZ(By) * csw_r;
CloverTerm += fillCloverXY(Bz) * csw_r;
CloverTerm += fillCloverXT(Ex) * csw_t;
CloverTerm += fillCloverYT(Ey) * csw_t;
CloverTerm += fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;
CloverTerm = Helpers::fillCloverYZ(Bx) * csw_r;
CloverTerm += Helpers::fillCloverXZ(By) * csw_r;
CloverTerm += Helpers::fillCloverXY(Bz) * csw_r;
CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
int lvol = _Umu.Grid()->lSites();
int DimRep = Impl::Dimension;
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Coordinate lcoor;
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
for (int site = 0; site < lvol; site++) {
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CTv, lcoor);
Qxinv = Zero();
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CTIv, lcoor);
}
}
double t4 = usecond();
CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
double t5 = usecond();
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
double t6 = usecond();
std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiation = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t6 - t0) / 1e6 << std::endl;
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.Checkerboard() = in.Checkerboard();
CloverFieldType *Clover;
CloverField *Clover;
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
if (dag)
@@ -186,12 +203,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
{
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
}
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = adj(*Clover) * in;
Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway
}
}
else
@@ -209,29 +226,109 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
// std::cout << "Calling clover term Even" << std::endl;
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
}
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
// std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = *Clover * in;
Helpers::multCloverField(out, *Clover, in);
}
}
} // MooeeInternal
// Derivative parts unpreconditioned pseudofermions
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
{
assert(0);
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
assert(0); // not implemented yet
}

View File

@@ -4,12 +4,13 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
Copyright (C) 2015
Copyright (C) 2022
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -397,6 +398,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
@@ -408,6 +410,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@@ -420,6 +423,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@@ -596,11 +600,47 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
Gamma g5(Gamma::Algebra::Gamma5);
conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid());
assert(0);
auto UGrid= this->GaugeGrid();
PropagatorField tmp_shifted(UGrid);
PropagatorField g5Lg5(UGrid);
PropagatorField R(UGrid);
PropagatorField gmuR(UGrid);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
g5Lg5=g5*q_in_1*g5;
tmp_shifted=Cshift(q_in_2,mu,1);
Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
gmuR=gmu*R;
q_out=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
tmp_shifted=Cshift(q_in_1,mu,1);
Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
g5Lg5=g5*g5Lg5*g5;
R=q_in_2;
gmuR=gmu*R;
q_out-=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
}
@@ -614,9 +654,51 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid());
assert(0);
auto UGrid= this->GaugeGrid();
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
PropagatorField L(UGrid);
PropagatorField zz (UGrid);
zz=Zero();
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
tmp = Cshift(q_in,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
q_out = where((lcoor<=tmax),tmp,zz); // Position of current complicated
tmp = q_in *lattice_cmplx;
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -( Utmp + gmu*Utmp );
// Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
}
NAMESPACE_END(Grid);

View File

@@ -38,9 +38,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
// undefine everything related to kernels
#include <simd/Fujitsu_A64FX_undef.h>
// enable A64FX body
#define WILSONKERNELSASMBODYA64FX
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
///////////////////////////////////////////////////////////
// If we are A64FX specialise the single precision routine
@@ -63,119 +60,89 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
/////////////////////////////////////////////////////////////////
@@ -185,119 +152,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// undefine
@@ -330,119 +267,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, double
@@ -451,124 +358,93 @@ WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
// template<> void
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// undefs
#undef WILSONKERNELSASMBODYA64FX
#include <simd/Fujitsu_A64FX_undef.h>
#endif //A64FXASM

View File

@@ -74,15 +74,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
@@ -97,15 +97,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
@@ -121,15 +121,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single
@@ -148,15 +148,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
@@ -171,15 +171,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
@@ -194,15 +194,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef MAYBEPERM
#undef MULT_2SPIN
@@ -228,14 +228,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeF
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
@@ -249,14 +249,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
@@ -273,15 +273,15 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, single
@@ -299,14 +299,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
@@ -320,14 +320,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, Doubled
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
@@ -341,14 +341,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif // VEC 5D
@@ -392,14 +392,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
@@ -413,14 +413,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
@@ -434,14 +434,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single
@@ -459,14 +459,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
@@ -480,14 +480,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
@@ -501,14 +501,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef MAYBEPERM
#undef MULT_2SPIN
@@ -533,14 +533,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeF
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
@@ -554,14 +554,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
@@ -577,14 +577,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, single
@@ -602,14 +602,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
@@ -623,14 +623,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, Doubled
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
@@ -645,14 +645,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, Doubled
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif // VEC 5D

View File

@@ -25,6 +25,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// GCC 10 messes up SVE instruction scheduling using -O3, but
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
// performance now is better than armclang 20.2
#ifdef KERNEL_DAG
#define DIR0_PROJ XP_PROJ
#define DIR1_PROJ YP_PROJ
@@ -110,6 +115,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
} \
RECON; \
/*
NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
though I expected that it would improve on performance
*/
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \
@@ -132,60 +142,48 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \
PREFETCH_CHIMU_L2(basep); \
} else { PREFETCH_CHIMU(base); } \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Post comms kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
}
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
{ ZERO_PSI;} \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
}
@@ -193,6 +191,8 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
#endif
{
int nmu;
int local,perm, ptype;
@@ -209,7 +209,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
// int sUn=lo.Reorder(ssn);
int sUn=ssn;
LOCK_GAUGE(0);
#else
int sU =ssU;
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
@@ -295,6 +294,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
// { uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
#ifdef SHOW
@@ -308,6 +312,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
#ifdef SHOW
@@ -321,6 +330,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
#ifdef SHOW
@@ -341,6 +355,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
base = (uint64_t) &out[ss];
basep= st.GetPFInfo(nent,plocal); ent++;
basep = (uint64_t) &out[ssn];
//PREFETCH_RESULT_L1_STORE(base);
RESULT(base,basep);
#ifdef SHOW

View File

@@ -76,7 +76,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define REGISTER
#define LOAD_CHIMU \
#ifdef GRID_SIMT
#define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \
Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane); \
Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane); \
Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane); \
Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane); \
Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane); \
Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane); \
Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane); \
Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane); \
Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane); \
Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane); \
Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane); \
Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane); }
#define PERMUTE_DIR(dir) ;
#else
#define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
@@ -91,54 +108,54 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Chimu_31=ref()(3)(1);\
Chimu_32=ref()(3)(2);}
#define LOAD_CHI\
{const SiteHalfSpinor &ref(buf[offset]); \
Chi_00 = ref()(0)(0);\
Chi_01 = ref()(0)(1);\
Chi_02 = ref()(0)(2);\
Chi_10 = ref()(1)(0);\
Chi_11 = ref()(1)(1);\
Chi_12 = ref()(1)(2);}
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00); \
permute##dir(Chi_01,Chi_01); \
permute##dir(Chi_02,Chi_02); \
permute##dir(Chi_10,Chi_10); \
permute##dir(Chi_11,Chi_11); \
permute##dir(Chi_12,Chi_12);
#endif
// To splat or not to splat depends on the implementation
#define MULT_2SPIN(A)\
{auto & ref(U[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00;\
UChi_10 = U_00*Chi_10;\
UChi_01 = U_10*Chi_00;\
UChi_11 = U_10*Chi_10;\
UChi_02 = U_20*Chi_00;\
UChi_12 = U_20*Chi_10;\
UChi_00+= U_01*Chi_01;\
UChi_10+= U_01*Chi_11;\
UChi_01+= U_11*Chi_01;\
UChi_11+= U_11*Chi_11;\
UChi_02+= U_21*Chi_01;\
UChi_12+= U_21*Chi_11;\
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02;\
UChi_10+= U_00*Chi_12;\
UChi_01+= U_10*Chi_02;\
UChi_11+= U_10*Chi_12;\
UChi_02+= U_20*Chi_02;\
U_00=coalescedRead(ref()(0,0),lane); \
U_10=coalescedRead(ref()(1,0),lane); \
U_20=coalescedRead(ref()(2,0),lane); \
U_01=coalescedRead(ref()(0,1),lane); \
U_11=coalescedRead(ref()(1,1),lane); \
U_21=coalescedRead(ref()(2,1),lane); \
UChi_00 = U_00*Chi_00; \
UChi_10 = U_00*Chi_10; \
UChi_01 = U_10*Chi_00; \
UChi_11 = U_10*Chi_10; \
UChi_02 = U_20*Chi_00; \
UChi_12 = U_20*Chi_10; \
UChi_00+= U_01*Chi_01; \
UChi_10+= U_01*Chi_11; \
UChi_01+= U_11*Chi_01; \
UChi_11+= U_11*Chi_11; \
UChi_02+= U_21*Chi_01; \
UChi_12+= U_21*Chi_11; \
U_00=coalescedRead(ref()(0,2),lane); \
U_10=coalescedRead(ref()(1,2),lane); \
U_20=coalescedRead(ref()(2,2),lane); \
UChi_00+= U_00*Chi_02; \
UChi_10+= U_00*Chi_12; \
UChi_01+= U_10*Chi_02; \
UChi_11+= U_10*Chi_12; \
UChi_02+= U_20*Chi_02; \
UChi_12+= U_20*Chi_12;}
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00);\
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10);\
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);
#define LOAD_CHI \
{const SiteHalfSpinor &ref(buf[offset]); \
Chi_00 = coalescedRead(ref()(0)(0),lane); \
Chi_01 = coalescedRead(ref()(0)(1),lane); \
Chi_02 = coalescedRead(ref()(0)(2),lane); \
Chi_10 = coalescedRead(ref()(1)(0),lane); \
Chi_11 = coalescedRead(ref()(1)(1),lane); \
Chi_12 = coalescedRead(ref()(1)(2),lane);}
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
@@ -353,13 +370,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
result_31-= UChi_11; \
result_32-= UChi_12;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \
{int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU; \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
@@ -367,16 +385,48 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON;
RECON; }
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
{ SE=&st_p[DIR+8*ss]; \
auto ptype=st_perm[DIR]; \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \
{ SE=&st_p[DIR+8*ss]; \
auto ptype=st_perm[DIR]; \
/*SE=st.GetEntry(ptype,DIR,ss);*/ \
auto offset = SE->_offset; \
auto perm = SE->_permute; \
LOAD_CHIMU(PERM); \
PROJ; \
MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
{ int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU; \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
@@ -384,57 +434,60 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
} else if ( st.same_node[DIR] ) { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN(DIR); \
RECON; \
}
} \
acceleratorSynchronise(); }
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
{ int ptype; \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
auto offset = SE->_offset; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
}
} \
acceleratorSynchronise(); }
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out[ss]); \
vstream(ref()(0)(0),result_00); \
vstream(ref()(0)(1),result_01); \
vstream(ref()(0)(2),result_02); \
vstream(ref()(1)(0),result_10); \
vstream(ref()(1)(1),result_11); \
vstream(ref()(1)(2),result_12); \
vstream(ref()(2)(0),result_20); \
vstream(ref()(2)(1),result_21); \
vstream(ref()(2)(2),result_22); \
vstream(ref()(3)(0),result_30); \
vstream(ref()(3)(1),result_31); \
vstream(ref()(3)(2),result_32); \
coalescedWrite(ref()(0)(0),result_00,lane); \
coalescedWrite(ref()(0)(1),result_01,lane); \
coalescedWrite(ref()(0)(2),result_02,lane); \
coalescedWrite(ref()(1)(0),result_10,lane); \
coalescedWrite(ref()(1)(1),result_11,lane); \
coalescedWrite(ref()(1)(2),result_12,lane); \
coalescedWrite(ref()(2)(0),result_20,lane); \
coalescedWrite(ref()(2)(1),result_21,lane); \
coalescedWrite(ref()(2)(2),result_22,lane); \
coalescedWrite(ref()(3)(0),result_30,lane); \
coalescedWrite(ref()(3)(1),result_31,lane); \
coalescedWrite(ref()(3)(2),result_32,lane); \
}
#define HAND_RESULT_EXT(ss) \
if (nmu){ \
{ \
SiteSpinor & ref (out[ss]); \
ref()(0)(0)+=result_00; \
ref()(0)(1)+=result_01; \
ref()(0)(2)+=result_02; \
ref()(1)(0)+=result_10; \
ref()(1)(1)+=result_11; \
ref()(1)(2)+=result_12; \
ref()(2)(0)+=result_20; \
ref()(2)(1)+=result_21; \
ref()(2)(2)+=result_22; \
ref()(3)(0)+=result_30; \
ref()(3)(1)+=result_31; \
ref()(3)(2)+=result_32; \
coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00,lane); \
coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01,lane); \
coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02,lane); \
coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10,lane); \
coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11,lane); \
coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12,lane); \
coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20,lane); \
coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21,lane); \
coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22,lane); \
coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30,lane); \
coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31,lane); \
coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32,lane); \
}
#define HAND_DECLARATIONS(a) \
#define HAND_DECLARATIONS(Simd) \
Simd result_00; \
Simd result_01; \
Simd result_02; \
@@ -467,18 +520,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Simd U_21;
#define ZERO_RESULT \
result_00=Zero(); \
result_01=Zero(); \
result_02=Zero(); \
result_10=Zero(); \
result_11=Zero(); \
result_12=Zero(); \
result_20=Zero(); \
result_21=Zero(); \
result_22=Zero(); \
result_30=Zero(); \
result_31=Zero(); \
result_32=Zero();
zeroit(result_00); \
zeroit(result_01); \
zeroit(result_02); \
zeroit(result_10); \
zeroit(result_11); \
zeroit(result_12); \
zeroit(result_20); \
zeroit(result_21); \
zeroit(result_22); \
zeroit(result_30); \
zeroit(result_31); \
zeroit(result_32);
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
@@ -495,19 +548,54 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
#ifdef SYCL_HACK
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
int ss,int sU,const SiteSpinor *in, SiteSpinor *out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef iSinglet<Simd> vCplx;
// typedef decltype( coalescedRead( vCplx()()() )) Simt;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
HAND_DECLARATIONS(ignore);
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
#endif
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@@ -523,14 +611,18 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
HAND_DECLARATIONS(ignore);
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
StencilEntry *SE;
int offset,local,perm, ptype;
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@@ -546,13 +638,18 @@ template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
HAND_DECLARATIONS(ignore);
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@@ -570,13 +667,18 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
HAND_DECLARATIONS(ignore);
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
StencilEntry *SE;
int offset,local,perm, ptype;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@@ -593,13 +695,19 @@ template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
HAND_DECLARATIONS(ignore);
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
int offset, ptype;
HAND_DECLARATIONS(Simt);
// int offset, ptype;
StencilEntry *SE;
int nmu=0;
ZERO_RESULT;
@@ -618,13 +726,19 @@ template<class Impl> accelerator_inline
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
HAND_DECLARATIONS(ignore);
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
StencilEntry *SE;
int offset, ptype;
// int offset, ptype;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@@ -682,3 +796,4 @@ NAMESPACE_END(Grid);
#undef HAND_RESULT
#undef HAND_RESULT_INT
#undef HAND_RESULT_EXT
#undef HAND_DECLARATIONS

View File

@@ -416,6 +416,20 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody
}
#define KERNEL_CALL_TMP(A) \
const uint64_t NN = Nsite*Ls; \
auto U_p = & U_v[0]; \
auto in_p = & in_v[0]; \
auto out_p = & out_v[0]; \
auto st_p = st_v._entries_p; \
auto st_perm = st_v._permute_type; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
}); \
accelerator_barrier();
#define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
@@ -445,20 +459,24 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifndef GRID_CUDA
#ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif
}
@@ -476,22 +494,25 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
acceleratorFenceComputeStream();
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif
acceleratorFenceComputeStream();
}
assert(0 && " Kernel optimisation case not covered ");
}

View File

@@ -0,0 +1,44 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
../WilsonCloverFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
../WilsonKernelsInstantiationGparity.cc.master

View File

@@ -1 +0,0 @@
#define IMPLEMENTATION GparityWilsonImplDF

View File

@@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
../WilsonCloverFermionInstantiation.cc.master

Some files were not shown because too many files have changed in this diff Show More