mirror of https://github.com/paboyle/Grid.git synced 2025-06-23 18:22:02 +01:00

Compare commits


211 Commits

Author SHA1 Message Date
6b979f0a69 Dirichlet improvements that I failed to commit 2023-04-04 23:13:17 -04:00
fc4db5e963 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-04-03 18:26:11 -04:00
6252ffaf76 No unified 2023-04-03 18:25:22 -04:00
58e020b62a Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-29 14:37:40 -04:00
a7e1aceeca Compile fix on Nvidia 2023-03-29 14:36:50 -04:00
7212432f43 More careful fencing 2023-03-28 20:10:22 -07:00
4a261fab30 Changes premerge to develop 2023-03-28 20:04:21 -07:00
6af97069b9 Preparing for close of feature/dirichlet
Initial code change review complete
2023-03-28 13:39:44 -07:00
5068413cdb Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 08:35:38 -07:00
71c6960eea Comment 2023-03-28 08:34:24 -07:00
ddf6d5c9e3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 11:33:05 -04:00
900e01f49b Temporary 2023-03-27 21:35:06 -07:00
2376156fbc Merge branch 'develop' into feature/dirichlet 2023-03-27 21:33:50 -07:00
3f2fd49db4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-03-27 17:29:54 -07:00
0efa107cb6 Script update 2023-03-27 17:29:43 -07:00
8feedb4f6f Include files moved 2023-03-27 17:29:21 -07:00
05e562e3d7 Move the copy synch out to stencil and do one per call instead of one per packet 2023-03-27 17:28:38 -07:00
dd3bbb8fa2 Move the synchronise out to the stencil so there is one call instead of one call per packet 2023-03-27 17:27:45 -07:00
2fbcf13c46 SYCL fix 2023-03-27 14:25:14 -07:00
4ea48ef0c4 Merge pull request #419 from lehner/feature/gpt
Separate rankSum from sum
2023-03-24 15:42:16 -04:00
5c85774ee3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-24 15:40:57 -04:00
d8a9a745d8 stream synchronise 2023-03-24 15:40:30 -04:00
dcf172da3b Merge pull request #415 from paboyle/feature/block_lanczos22
Feature/block lanczos22
2023-03-24 12:08:16 -04:00
d57ed25071 Merge branch 'feature/dirichlet' into feature/block_lanczos22 2023-03-24 12:08:09 -04:00
546be724e7 Merge pull request #421 from UniOfLeicester/feature/accel_Copy_plane
Populate the Cshift_table on the GPU
2023-03-24 12:04:06 -04:00
8a1b9073f9 Mshift update 2023-03-23 15:39:30 -04:00
1a7114d4b9 Temporary algorithm while sorting out mixed prec 2023-03-23 15:38:35 -04:00
3f385f717c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet
Conflicts:
	systems/PVC/benchmarks/run-2tile-mpi.sh
	systems/PVC/config-command
2023-03-23 14:52:53 -04:00
481bbaf1fc Interface to query memory use 2023-03-23 12:55:31 -04:00
281488611a WriteDiscard on construct 2023-03-23 10:28:50 -04:00
c180a52518 Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-23 10:28:01 -04:00
90130e25e9 TODO list 2023-03-23 10:27:02 -04:00
23298acb81 Merge pull request #424 from giltirn/feature/dirichlet-precchange
Precision change implementation
2023-03-22 23:04:52 -04:00
52384e34cf Discard on construct 2023-03-22 19:40:32 -04:00
d0bb033ea2 Device resident GPU block buffer instead of UVM, as we likely hit a UVM
bug. Code worked on CUDA 11.4 but fails on later drivers (certainly 530.30.02, but need to
find the Perlmutter driver version).
2023-03-22 19:07:32 -04:00
c6621806ca Compiling on laptop and running 2023-03-21 17:27:09 -04:00
0b6f0f6d2f Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:06:55 -04:00
b5b759df73 Merge branch 'develop' into feature/dirichlet 2023-03-21 16:05:46 -04:00
7db8dd7a95 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:04:27 -04:00
8b43be39c0 Config command 2023-03-21 16:00:52 -04:00
f17f879206 Test update 2023-03-21 15:59:29 -04:00
68428fceab Integrator update 2023-03-21 15:58:49 -04:00
4135f2dcd1 Compressor 2023-03-21 15:41:41 -04:00
c5bdf61215 Audit fix 2023-03-21 15:38:39 -04:00
88e218e8ee Stencil updates 2023-03-21 15:37:58 -04:00
0f2b786436 Vector -> vector 2023-03-21 15:36:11 -04:00
e1c326558a Comms improvements 2023-03-21 08:53:56 -07:00
bae0f8ea99 Merge pull request #425 from rrhodgson/feature/CacheLogging
Huge Cache
2023-03-21 08:59:08 -04:00
bbbcd36ae5 Merge pull request #426 from rrhodgson/feature/LCDeflation
Batched Local Coherence Tools
2023-03-21 08:58:40 -04:00
39c0815d9e WriteDiscard 2023-03-21 08:57:29 -04:00
a997d24743 Remove nofma 2023-03-14 12:10:31 -07:00
861e5d7f4c SYCL version update. Why do they keep making incompatible changes? 2023-03-14 12:10:02 -07:00
14cc142a14 Warning remove 2023-03-14 12:09:26 -07:00
f36b87deb5 syscall fix 2023-03-14 12:09:00 -07:00
eeb6e0a6e3 Re-enable cache blocking and efficient UPI-type SHM comms 2023-03-14 09:10:27 -07:00
cad5b187dd Cleanup 2023-03-14 09:08:16 -07:00
87697eb07e Shared compile 2023-03-14 09:07:36 -07:00
a3e935c902 Batched block project/promote size checks 2023-02-27 11:38:16 +00:00
7731c7db8e Add huge cache type and allow Ncache==0 2023-02-26 14:15:28 +00:00
ff97340324 Expose cached bytes 2023-02-26 12:22:45 +00:00
83d86943db Fixed compile bug in MemoryManagerShared caused by Audit function not being passed a string 2023-02-23 13:09:45 -05:00
e82cf1d311 Further prec-change improvements
Mixed prec CG algorithm has been modified to precompute precision change workspaces

As the original Test_dwf_mixedcg_prec has been co-opted to do a performance stability and reproducibility test, requiring the single-prec CG to be run 200 times, I have created a new version of Test_dwf_mixedcg_prec in the solver subdirectory that just does the mixed vs double CG test
2023-02-23 09:45:29 -05:00
1db58a8acc Precision change improvements
Added a new, much faster implementation of precision change that uses (optionally) a precomputed workspace containing pointer offsets that is device resident, such that all lattice copying occurs only on the device and no host<->device transfer is required, other than the pointer table. It also avoids the need to unpack and repack the fields using explicit lane copying. When this new precisionChange is called without a workspace, one will be computed on-the-fly; however it is still considerably faster than the original implementation.

In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes.

Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces

Renamed the original precisionChange as precisionChangeOrig

Fixed incorrect pointer offset bug in copyLane

Added a test and a benchmark for precisionChange

Added a test for reliable update CG
2023-02-21 10:52:42 -05:00
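For illustration, the workspace pattern this commit describes looks roughly as follows in user code (a sketch assembled from the solver diffs further down; the grid and field names are placeholders):

  precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid); // (output grid, input grid)
  precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
  LatticeFermionD src_d(DoublePrecGrid);
  LatticeFermionF src_f(SinglePrecGrid);
  precisionChange(src_f, src_d, pc_wk_dp_to_sp); // double -> single, all copying on device
  precisionChange(src_d, src_f, pc_wk_sp_to_dp); // single -> double
  // Without the workspace argument, precisionChange computes one on the fly
  // (still faster than the original, now renamed precisionChangeOrig).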
920a51438d Added batched Mixed precision CG 2023-02-14 17:04:13 +00:00
be528b6d27 Add batched block project/promote functions 2023-02-14 14:37:10 +00:00
796abfad80 Merge pull request #422 from fjosw/fix/NVCC_DIAG_PRAGMA_SUPPORT
Disable diagnostic pragma warnings for CUDA 12+
2023-01-17 09:34:49 -05:00
ad0270ac8c fix: diagnostic pragma warnings fixed for CUDA 12+ 2023-01-12 12:36:30 +00:00
7d62f1d6d2 Populate the Cshift_table on the GPU
Cshift_table is allocated in unified memory and used
in the LambdaApply kernels, but is also populated
from the host. This creates a lot of unified-memory HtoD
and DtoH operations and has a negative effect
on performance. With this commit we populate the
Cshift_table on the device with the
populate_Cshift_table() kernel.
2023-01-11 21:26:25 +00:00
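The idea, sketched (accelerator_for is Grid's device loop macro, visible in the diffs below; the entry computation is a hypothetical stand-in, as populate_Cshift_table()'s body is not shown here):

  // Before: the unified-memory table was filled on the host, forcing HtoD/DtoH migrations:
  //   for(int i=0;i<num_entries;i++) Cshift_table[i] = compute_entry(i);
  // After: fill it in a device kernel so it stays resident where the LambdaApply kernels read it:
  accelerator_for(i, num_entries, 1, {
    Cshift_table[i] = compute_entry(i); // compute_entry is a hypothetical stand-in
  });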
458c943987 merged upstream 2022-12-31 11:16:21 +02:00
88015b0858 Split sum in rankSum and GlobalSum 2022-12-26 10:01:32 +01:00
4ca1bf7cca Added gauge invariance test 2022-12-21 07:23:16 -05:00
2ff868f7a5 CPU open doesn't need to free space 2022-12-20 05:10:23 -05:00
ede02b6883 Memory manager debug Felix case 2022-12-20 05:10:23 -05:00
1822ced302 Bug fix 2022-12-20 05:10:23 -05:00
37ba32776f More logging 2022-12-20 05:10:23 -05:00
99b3697b03 More logging 2022-12-20 05:10:23 -05:00
43a45ec97b SSC_START 2022-12-20 05:10:23 -05:00
b00a4142e5 A=A fix 2022-12-20 05:10:23 -05:00
3791bc527b Logging pulled in from dirichlet branch 2022-12-20 05:10:23 -05:00
d8c29f5fcf Updated FFT test for PETSc 2022-12-18 12:05:00 -05:00
281f8101fe Matt FFT test 2022-12-17 20:35:33 -05:00
472ed2dd5c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-12-17 20:17:09 -05:00
4f85672674 Simpler test for PETSc 2022-12-17 20:16:11 -05:00
dc747c54be Merge branch 'develop' into feature/dirichlet
Conflicts:
	Grid/qcd/action/fermion/WilsonCompressor.h
	Grid/stencil/Stencil.h
2022-12-13 08:24:58 -05:00
140684d706 Head to head vs HMC 2022-12-13 08:15:38 -05:00
5bb7ba92fa Test for DDHMC force term 2022-12-13 08:15:11 -05:00
b54d0f3c73 Smaller deltaH down to 7000s on t=0.5 trajectory 2022-12-13 08:14:27 -05:00
ff6777a98d Variable depth experiments 2022-12-13 08:13:51 -05:00
07acfe89f2 Merge pull request #417 from rrhodgson/feature/fermtoprop
Feature/fermtoprop
2022-12-06 12:45:03 -05:00
40234f531f FermToProp accelerator_for -> thread_for 2022-12-06 17:34:51 +00:00
d49694f38f PropToFerm fix 2022-12-06 15:48:54 +00:00
dc6a38f177 Minor cleanup 2022-11-30 17:13:12 -05:00
82c1ecf60f Block lanczos added 2022-11-30 16:08:40 -05:00
67f569354e Partial dirichlet changes 2022-11-30 15:51:13 -05:00
97a098636d FermToProp 2022-11-30 15:36:35 -05:00
e13930c8b2 Faster fermtoprop case 2022-11-30 15:11:29 -05:00
5fa573dfd3 partial send fix 2022-11-25 00:51:04 -05:00
f6402cb6c4 AUDIT removal 2022-11-25 00:50:33 -05:00
bae6c263dc Audit 2022-11-25 00:47:01 -05:00
d71672dca9 Bug fix 2022-11-25 00:46:35 -05:00
121c9e2ceb Tracing 2022-11-25 00:45:21 -05:00
63a30ae34f Tracing 2022-11-25 00:45:05 -05:00
7d8231ba32 Tracing 2022-11-25 00:44:57 -05:00
b690b1cbe9 Audit 2022-11-25 00:43:57 -05:00
c0fb20fc03 Audit check for wrongly locked data 2022-11-25 00:43:12 -05:00
bc9579dac6 Old code path removed 2022-11-25 00:40:45 -05:00
a5c77f8b95 Tracing moved in order 2022-11-25 00:40:27 -05:00
3dbfce5223 Tests clean build on HIP 2022-11-16 20:15:51 -05:00
e51eaedc56 Making tests compile 2022-11-15 22:58:30 -05:00
e2a938e7f7 GPU happy for compile...? 2022-11-15 17:48:18 -05:00
ddad25211b Extra instantiations 2022-11-15 17:47:52 -05:00
6209120de9 Fix to GPU compile attempt 2022-11-15 17:25:58 -05:00
fe6e8f5ac6 Benchmark_comms fix 2022-11-15 17:00:49 -05:00
ee84dcb400 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-11-15 16:41:55 -05:00
0ae0e5f436 Partial Dirichlet test 2022-11-15 16:40:38 -05:00
e047616571 Multilevel integrator test 2022-11-15 16:39:39 -05:00
1af7572c61 Some test HMCs for DDHMC 2022-11-15 16:38:51 -05:00
653039695b Partial dirichlet changes 2022-11-15 16:37:15 -05:00
ca62abd203 Record some perturbative free field calculation 2022-11-15 16:36:46 -05:00
e74666a09c Double length vector type for fast precision change 2022-11-15 16:34:21 -05:00
45a001e078 Debug compile 2022-11-15 16:27:20 -05:00
0352da34f0 Several deleted files 2022-11-15 16:26:49 -05:00
7d302a525d Natural place for this routine is here 2022-11-15 16:24:55 -05:00
e2e269e03b Partial dirichlet BCs 2022-11-15 16:24:26 -05:00
0db4f1803f Partial dirichlet support 2022-11-15 16:23:41 -05:00
5fe480d81c Generic patch 2022-11-15 16:21:45 -05:00
0566fc6267 Partial Dirichlet 2022-11-15 16:21:24 -05:00
a11c12e2e7 Modifications for partial dirichlet BCs 2022-11-15 16:20:01 -05:00
0655dab466 OpenMP on host enabled 2022-11-08 13:38:54 -08:00
7f097bcc28 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-11-08 13:23:40 -08:00
5c75aa5008 Device mem 2022-11-08 13:22:57 -08:00
1873101362 PVC 2022-11-08 13:22:45 -08:00
63fd1dfa62 Config on PVC 2022-11-08 13:22:09 -08:00
bd68861b28 SYCL sum 2022-11-08 12:49:26 -08:00
82e959f66c SYCL reduction 2022-11-08 12:45:25 -08:00
006268f556 DWF Slow version 2022-11-02 20:24:51 -04:00
78acae9b50 Simple DWF for easy check 2022-11-02 20:24:17 -04:00
a3927a8a27 Dirichlet 2022-11-02 20:22:27 -04:00
d9dd9a5b5f LLVM update 2022-11-02 19:51:50 -04:00
eae1c02111 Bounds check 2022-11-02 19:50:32 -04:00
132d841b05 Compile fix 2022-11-02 19:33:22 -04:00
62e52de06d Merge pull request #414 from fjosw/feat/eCloverGPU
Compact Exponential Cloverterm on GPU
2022-11-01 09:15:44 -04:00
184adeedb8 feat: renamed open_boundaries to fixedBoundaries 2022-10-26 12:53:46 +01:00
5fa6a8b96d docs: CompactClover debug info generalized. 2022-10-26 12:41:14 +01:00
a2a879b668 docs: CompactClover Debug Info improved. 2022-10-25 17:20:42 +01:00
9317d893b2 docs: details about inversion of CompactClover term added. 2022-10-25 17:10:06 +01:00
86075fdd45 feat: MassTerm and ExponentiateClover merged into InstantiateClover 2022-10-25 17:05:34 +01:00
b36442e263 feat: CloverHelpers::InvertClover implemented which handles the
inversion of the Clover term depending on clover type and the boundary
conditions.
2022-10-25 16:57:01 +01:00
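A hedged sketch of the dispatch this commit describes (InvertClover and fixedBoundaries are named in these commits; the helper calls are hypothetical):

  // Invert the clover term according to clover type and boundary conditions.
  template<class Field>
  static void InvertClover(Field &Dinv, const Field &D, bool fixedBoundaries) {
    if (fixedBoundaries) {
      Dinv = BoundarySafeInverse(D); // hypothetical: inversion path safe at fixed boundaries
    } else {
      Dinv = SiteLocalInverse(D);    // hypothetical: direct site-by-site inverse
    }
  }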
513d797ea6 fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed. 2022-10-25 16:17:22 +01:00
9e4835a3e3 feat: changed CompactWilsonExpClover exponentiation to Taylor expansion
with Horner scheme.
2022-10-25 15:19:43 +01:00
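For reference, a Taylor-with-Horner exponentiation of a site matrix looks like this (a generic sketch, not the CompactWilsonExpClover code itself):

  // exp(A) ~ I + A(I + (A/2)(I + (A/3)( ... (I + A/N) ... )))
  template<class Matrix>
  Matrix ExpTaylorHorner(const Matrix &A, const Matrix &Unit, int N) {
    Matrix R = Unit + A * (1.0 / N);   // innermost bracket
    for (int k = N - 1; k >= 1; k--)
      R = Unit + A * (1.0 / k) * R;    // R <- I + (A/k) * R
    return R;                          // degree-N Taylor approximation to exp(A)
  }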
2e8c3b0ddb Slow implementation of Shamir DWF 2022-10-18 18:10:01 -04:00
991667ba5e Revert 2022-10-13 18:50:35 -04:00
8a07b52009 Dirichlet 2022-10-13 18:44:47 -04:00
2bcff94b52 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-13 18:42:04 -04:00
d089739e2f Hack for lattice sites 2022-10-13 17:55:50 -04:00
204c283e16 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-11 14:59:07 -04:00
551a5f8dc8 RRII gpu option 2022-10-11 14:44:55 -04:00
c82b164f6b Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-04 17:41:48 -04:00
584a3ee45c Merge pull request #412 from giltirn/patch/adaptive-wflow
Patch/adaptive wflow
2022-10-04 17:23:19 -04:00
eec0c9eb7d Merge pull request #411 from giltirn/patch/dirichlet-fixes
Various fixes / changes
2022-10-04 17:22:01 -04:00
477ebf24f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-10-04 11:19:43 -07:00
0d5639f707 Run script update 2022-10-04 11:13:41 -07:00
413312f9a9 Benchmark the halo construction.
The byte counts are out and should be doubled for SIMD directions
2022-10-04 11:12:59 -07:00
03508448f8 Remove verbose 2022-10-04 11:12:15 -07:00
e1e5c75023 Stencil gather improvements - SVM was running slow and was used for a pointer array that didn't need to be in SVM 2022-10-04 11:11:10 -07:00
9296299b61 Better commenting 2022-10-04 11:10:34 -07:00
66d001ec9e Refactored Wilson flow class; previously the class implemented both iterative and adaptive smearing, but only the iterative method was accessible through the Smearing base class. The implementation of Smearing also forced a clunky need to pass iterative smearing parameters through the constructor but adaptive smearing parameters through the function call. Now there is a WilsonFlowBase class that implements common functionality, and separate WilsonFlow (iterative) and WilsonFlowAdaptive (adaptive) classes, both of which implement Smearing virtual functions.
Modified the Wilson flow adaptive smearing step size update to implement the original Ramos definition of the distance, where previously it used the norm of a difference which scales with the volume and so would choose too coarse or too fine steps depending on the volume. This is based on Chulwoo's code.

Added a test comparing adaptive (with tuneable tolerance) to iterative Wilson flow smearing on a random gauge configuration.
2022-10-03 10:59:38 -04:00
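The resulting shape of the hierarchy, sketched (class names are from the commit message; method signatures are illustrative):

  template<class Gimpl>
  class WilsonFlowBase : public Smearing<Gimpl> {
  public:
    typedef typename Gimpl::GaugeField GaugeField;
    // common functionality: measurements, energy density, ...
    virtual void smear(GaugeField &out, const GaugeField &in) const = 0;
  };

  template<class Gimpl>
  class WilsonFlow : public WilsonFlowBase<Gimpl> {         // fixed-step iterative flow
  public:
    typedef typename Gimpl::GaugeField GaugeField;
    void smear(GaugeField &out, const GaugeField &in) const override;
  };

  template<class Gimpl>
  class WilsonFlowAdaptive : public WilsonFlowBase<Gimpl> { // step size chosen against a tolerance
  public:
    typedef typename Gimpl::GaugeField GaugeField;
    void smear(GaugeField &out, const GaugeField &in) const override;
  };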
fad2f969d9 Summit up to date 2022-09-27 10:58:43 -04:00
48165c1dc1 Ticked off a few items 2022-09-27 10:58:00 -04:00
25df2d2c3b Various precision options 2022-09-27 10:57:12 -04:00
af9ecb8b41 Current tests compiling 2022-09-27 10:56:55 -04:00
234324599e Double2 2022-09-27 10:56:10 -04:00
97448a93dc Double2 compiles and dslash runs 2022-09-27 10:55:25 -04:00
70c83ec3be More instantiations 2022-09-27 10:54:23 -04:00
8f4e2ee545 Double2 2022-09-27 10:53:46 -04:00
e8bfbf2f7c D2 operators 2022-09-27 10:37:45 -04:00
9e81b42981 D2 fields 2022-09-27 10:37:19 -04:00
6c9eef9726 D2 fields 2022-09-27 10:36:54 -04:00
7ffbc3e98e Double2 improved. Really don't like 'convertType' - localise to a GPT
header
2022-09-27 10:35:31 -04:00
68e4d833dd Run through wrapper script 2022-09-23 16:49:29 -04:00
a2cefaa53a Faster 2022-09-23 16:49:14 -04:00
a0d682687e Better logging of Fdt for force gradient 2022-09-23 16:22:53 -04:00
eb552c3ecd dt info 2022-09-23 16:22:28 -04:00
97cce103d7 Tolerances control 2022-09-23 16:21:49 -04:00
87ac7104f8 Prettier 2022-09-23 16:20:46 -04:00
e4c117aabf Compile fix, multishift mixed prec support 2022-09-23 16:19:27 -04:00
5b128a6f9f MixedPrec Multishift with better precision scheme for GPU 2022-09-23 16:18:47 -04:00
19da647e3c Added support for non-periodic gauge field implementations in the random gauge shift performed at the start of the HMC trajectory
(The above required exposing the gauge implementation to the HMC class through the Integrator class)
Made the random shift optional (default on) through a parameter in HMCparameters
Modified ConjugateBC::CshiftLink such that it supports any shift in  -L < shift < L rather than just +-1
Added a tester for the BC-respecting Cshift
Fixed a missing system header include in SSE4 intrinsics wrapper
Fixed sumD_cpu for single-prec types performing an incorrect conversion to a single-prec data type at the end, that fails to compile on some systems
2022-09-09 12:47:09 -04:00
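The generalized CshiftLink can be pictured like this (a sketch of the semantics only, not the library's implementation, which supports the shift directly):

  // Shift a gauge link field by 'shift' steps along mu, applying the
  // boundary prescription (e.g. charge conjugation) on every boundary crossing:
  template<class GaugeLinkField>
  GaugeLinkField CshiftLinkGeneral(const GaugeLinkField &Link, int mu, int shift) {
    GaugeLinkField tmp(Link);
    int step = (shift > 0) ? +1 : -1;
    for (int i = 0; i < std::abs(shift); i++)
      tmp = ConjugateBC::CshiftLink(tmp, mu, step); // unit shifts compose
    return tmp;
  }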
1713de35c0 Improved config flags 2022-09-05 21:50:02 -04:00
1177b8f661 Merge branch 'develop' into feature/dirichlet 2022-08-31 19:05:57 -04:00
442bfb3d42 Merge branch 'develop' into feature/dirichlet 2022-08-31 19:04:19 -04:00
913fbca74a Merge pull request #410 from gkanwar/photon_and_sha_patches
Photon.h and SHA256 patches
2022-08-31 18:01:45 -04:00
60dfb49afa Remove FP16 tests when FP16 is disabled 2022-08-21 17:29:55 +02:00
554c238359 Update OpenSSL digest to use high-level methods
This avoids deprecation warnings when compiling against OpenSSL 3.0
but should still be backwards compatible. It is the recommended way
to use the digest API going forward.
2022-08-21 17:28:57 +02:00
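The high-level API referred to is OpenSSL's EVP digest interface; a minimal SHA-256 example of the pattern (standard OpenSSL calls, not the Grid code itself):

  #include <openssl/evp.h>
  #include <vector>

  std::vector<unsigned char> sha256(const unsigned char *data, size_t len) {
    std::vector<unsigned char> out(EVP_MD_size(EVP_sha256()));
    unsigned int outlen = 0;
    EVP_MD_CTX *ctx = EVP_MD_CTX_new();           // replaces deprecated SHA256_Init
    EVP_DigestInit_ex(ctx, EVP_sha256(), nullptr);
    EVP_DigestUpdate(ctx, data, len);             // replaces SHA256_Update
    EVP_DigestFinal_ex(ctx, out.data(), &outlen); // replaces SHA256_Final
    EVP_MD_CTX_free(ctx);
    return out;
  }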
f922adf05e Fix Photon ComplexField type 2022-08-21 16:16:18 +02:00
188d2c7a4d PVC default, ignore ATS 2022-08-02 08:38:53 -07:00
17d7177105 Files for SYCL 2022-08-02 08:33:39 -07:00
bb0a0da47a Non-blocking caution due to SYCL 2022-08-02 08:09:43 -07:00
84110166e4 Fix the fence 2022-08-02 08:00:43 -07:00
d32b923b6c Fencing on a stream in SYCL is needed. Didn't know that ... gulp 2022-08-02 07:58:04 -07:00
2ab1af5754 Ensure no synchronize and not option dependent 2022-07-19 09:51:06 -07:00
5f8892bf03 Mistake pointed out by Camilo 2022-07-19 09:31:51 -07:00
f14e7e51e7 Grid accelerator 2022-07-12 10:56:22 -07:00
042ab1a052 Update GridStd.h 2022-06-27 13:21:39 -04:00
2df98a99bc Merge pull request #406 from giordano/patch-1
Update default value of gen-simd-width in README
2022-06-14 17:46:25 -04:00
315ea18be2 Update default value of gen-simd-width in README 2022-06-14 22:41:05 +01:00
a9c2e1df03 Merge pull request #404 from rrhodgson/feature/json_nvcc
Feature/json nvcc
2022-05-25 13:30:11 -04:00
da4daea57a Updated json to latest release 3.10.5 2022-05-24 16:16:06 +01:00
e346154c5d Updated json CUDA compile guards 2022-05-24 15:48:01 +01:00
3ca0de1c40 Fix json write for vector<string> 2022-05-24 14:37:33 +01:00
c7205d2a73 Removed nvcc guards for json 2022-05-24 14:30:26 +01:00
317 changed files with 28136 additions and 13525 deletions

View File

@@ -45,7 +45,7 @@ directory
 //disables nvcc specific warning in json.hpp
 #pragma clang diagnostic ignored "-Wdeprecated-register"
-#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
+#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 //disables nvcc specific warning in json.hpp
 #pragma nv_diag_suppress unsigned_compare_with_zero
 #pragma nv_diag_suppress cast_to_qualified_type

View File

@@ -44,10 +44,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridStd.h>
 #include <Grid/threads/Pragmas.h>
 #include <Grid/perfmon/Timer.h>
+#include <Grid/perfmon/Tracing.h>
 //#include <Grid/perfmon/PerfCount.h>
 #include <Grid/util/Util.h>
 #include <Grid/log/Log.h>
-#include <Grid/perfmon/Tracing.h>
 #include <Grid/allocator/Allocator.h>
 #include <Grid/simd/Simd.h>
 #include <Grid/threads/ThreadReduction.h>

View File

@@ -14,7 +14,7 @@
 /* NVCC save and restore compile environment*/
 #ifdef __NVCC__
 #pragma push
-#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
+#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
 #pragma nv_diag_suppress code_is_unreachable
 #else
 #pragma diag_suppress code_is_unreachable

View File

@@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>

View File

@@ -262,7 +262,7 @@ public:
 autoView( Tnp_v , (*Tnp), AcceleratorWrite);
 autoView( Tnm_v , (*Tnm), AcceleratorWrite);
 const int Nsimd = CComplex::Nsimd();
-accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+accelerator_for(ss, FineGrid->oSites(), Nsimd, {
 coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
 });
@@ -324,9 +324,9 @@
 GridBase* _cbgrid;
 int hermitian;
-CartesianStencil<siteVector,siteVector,int> Stencil;
-CartesianStencil<siteVector,siteVector,int> StencilEven;
-CartesianStencil<siteVector,siteVector,int> StencilOdd;
+CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil;
+CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven;
+CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd;
 std::vector<CoarseMatrix> A;
 std::vector<CoarseMatrix> Aeven;
@@ -631,7 +631,7 @@ public:
 assert(Aself != nullptr);
 }
-void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
+void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
 const CoarseVector &in, CoarseVector &out, int dag) {
 int point = geom.npoint-1;
 autoView( out_v, out, AcceleratorWrite);
@@ -694,7 +694,7 @@
 }
 }
-void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
+void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a,
 const CoarseVector &in, CoarseVector &out, int dag) {
 SimpleCompressor<siteVector> compressor;
@@ -784,9 +784,9 @@ public:
 _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
 geom(CoarseGrid._ndimension),
 hermitian(hermitian_),
-Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
+StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements),
+StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements),
 A(geom.npoint,&CoarseGrid),
 Aeven(geom.npoint,_cbgrid),
 Aodd(geom.npoint,_cbgrid),
@@ -804,9 +804,9 @@
 _cbgrid(&CoarseRBGrid),
 geom(CoarseGrid._ndimension),
 hermitian(hermitian_),
-Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
-StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
+Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
+StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements),
+StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements),
 A(geom.npoint,&CoarseGrid),
 Aeven(geom.npoint,&CoarseRBGrid),
 Aodd(geom.npoint,&CoarseRBGrid),

View File

@@ -526,6 +526,7 @@ public:
 (*this)(Linop,in[k],out[k]);
 }
 };
+virtual ~OperatorFunction(){};
 };
 template<class Field> class LinearFunction {

View File

@@ -258,26 +258,12 @@ public:
 for(int n=2;n<order;n++){
 Linop.HermOp(*Tn,y);
-#if 0
-auto y_v = y.View();
-auto Tn_v = Tn->View();
-auto Tnp_v = Tnp->View();
-auto Tnm_v = Tnm->View();
-constexpr int Nsimd = vector_type::Nsimd();
-accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
-coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
-coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
-});
-if ( Coeffs[n] != 0.0) {
-axpy(out,Coeffs[n],*Tnp,out);
-}
-#else
 axpby(y,xscale,mscale,y,(*Tn));
 axpby(*Tnp,2.0,-1.0,y,(*Tnm));
 if ( Coeffs[n] != 0.0) {
 axpy(out,Coeffs[n],*Tnp,out);
 }
-#endif
 // Cycle pointers to avoid copies
 Field *swizzle = Tnm;
 Tnm =Tn;

View File

@@ -191,7 +191,7 @@ public:
 std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
 std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
-std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
+std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
 if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

View File

@@ -108,7 +108,10 @@ NAMESPACE_BEGIN(Grid);
 GridStopWatch PrecChangeTimer;
 Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
+precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
 for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
 //Compute double precision rsd and also new RHS vector.
 Linop_d.HermOp(sol_d, tmp_d);
@@ -123,7 +126,7 @@
 while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
 PrecChangeTimer.Start();
-precisionChange(src_f, src_d);
+precisionChange(src_f, src_d, pc_wk_dp_to_sp);
 PrecChangeTimer.Stop();
 sol_f = Zero();
@@ -142,7 +145,7 @@
 //Convert sol back to double and add to double prec solution
 PrecChangeTimer.Start();
-precisionChange(tmp_d, sol_f);
+precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
 PrecChangeTimer.Stop();
 axpy(sol_d, 1.0, tmp_d, sol_d);

View File

@@ -0,0 +1,213 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif
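
A hedged usage sketch for the batched solver above (HermOpF/HermOpD and the fermion types are placeholders; the constructor arguments follow the definition above):

  MixedPrecisionConjugateGradientBatched<LatticeFermionD,LatticeFermionF>
    mCG(1.0e-8,          // Tolerance
        10000,           // MaxInnerIterations
        50,              // MaxOuterIterations
        10000,           // MaxPatchupIterations
        SinglePrecGrid, HermOpF, HermOpD);

  std::vector<LatticeFermionD> srcs(NBatch, DoublePrecGrid);
  std::vector<LatticeFermionD> sols(NBatch, DoublePrecGrid);
  for (auto &s : sols) s = Zero();
  mCG(srcs, sols);       // the batched operator() defined above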

View File

@@ -0,0 +1,373 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChangeFast(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChangeFast(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChangeFast(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChangeFast(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChangeFast(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);

View File

@ -81,6 +81,7 @@ public:
using OperatorFunction<FieldD>::operator(); using OperatorFunction<FieldD>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@ -95,9 +96,9 @@ public:
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts, ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f, GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq int _ReliableUpdateFreq) :
) : MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq) MaxIterations(20000)
{ {
verbose=1; verbose=1;
IterationsToCompleteShift.resize(_shifts.order); IterationsToCompleteShift.resize(_shifts.order);
@ -130,6 +131,9 @@ public:
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec"); GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid(); GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction" // Convenience references to the info stored in "MultiShiftFunction"
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
@ -154,6 +158,7 @@ public:
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift]; RealD bs[nshift];
RealD rsq[nshift]; RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2]; RealD z[nshift][2];
int converged[nshift]; int converged[nshift];
@ -164,12 +169,8 @@ public:
RealD cp,bp,qq; //prev RealD cp,bp,qq; //prev
// Matrix mult fields // Matrix mult fields
FieldF r_f(SinglePrecGrid);
FieldF p_f(SinglePrecGrid); FieldF p_f(SinglePrecGrid);
FieldF tmp_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid); FieldF mmp_f(SinglePrecGrid);
FieldF src_f(SinglePrecGrid);
precisionChange(src_f, src_d);
// Check lightest mass // Check lightest mass
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
@ -194,18 +195,26 @@ public:
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s]; rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl; std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d; ps_d[s] = src_d;
} }
// r and p for primary // r and p for primary
r_f=src_f; //residual maintained in single
p_f=src_f;
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0] //MdagM+m[0]
precisionChange(p_f, p_d, pc_wk_d_to_s);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
axpy(mmp_f,mass[0],p_f,mmp_f); precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
RealD rn = norm2(p_f); Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0]; d += rn*mass[0];
b = -cp /d; b = -cp /d;
@ -223,7 +232,7 @@ public:
// r += b[0] A.p[0] // r += b[0] A.p[0]
// c= norm(r) // c= norm(r)
c=axpy_norm(r_f,b,mmp_f,r_f); c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) { for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d); axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
@ -239,14 +248,9 @@ public:
// Iteration loop // Iteration loop
int k; int k;
for (k=1;k<=MaxIterations;k++){ for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp; a = c /cp;
//Update double precision search direction by residual
PrecChangeTimer.Start();
precisionChange(r_d, r_f);
PrecChangeTimer.Stop();
AXPYTimer.Start(); AXPYTimer.Start();
 axpy(p_d,a,p_d,r_d);
@@ -263,24 +267,28 @@ public:
 AXPYTimer.Stop();
 PrecChangeTimer.Start();
-precisionChange(p_f, p_d); //get back single prec search direction for linop
+precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
 PrecChangeTimer.Stop();
 cp=c;
 MatrixTimer.Start();
 Linop_f.HermOp(p_f,mmp_f);
-d=real(innerProduct(p_f,mmp_f));
 MatrixTimer.Stop();
+PrecChangeTimer.Start();
+precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
+PrecChangeTimer.Stop();
 AXPYTimer.Start();
-axpy(mmp_f,mass[0],p_f,mmp_f);
+d=real(innerProduct(p_d,mmp_d));
+axpy(mmp_d,mass[0],p_d,mmp_d);
 AXPYTimer.Stop();
-RealD rn = norm2(p_f);
+RealD rn = norm2(p_d);
 d += rn*mass[0];
 bp=b;
 b=-cp/d;
 // Toggle the recurrence history
 bs[0] = b;
 iz = 1-iz;
@@ -306,12 +314,12 @@ public:
 }
 //Perform reliable update if necessary; otherwise update residual from single-prec mmp
-RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
+c = axpy_norm(r_d,b,mmp_d,r_d);
 AXPYTimer.Stop();
-c = c_f;
 if(k % ReliableUpdateFreq == 0){
+RealD c_old = c;
 //Replace r with true residual
 MatrixTimer.Start();
 Linop_d.HermOp(psi_d[0],mmp_d);
@@ -320,15 +328,10 @@ public:
 AXPYTimer.Start();
 axpy(mmp_d,mass[0],psi_d[0],mmp_d);
-RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
+c = axpy_norm(r_d, -1.0, mmp_d, src_d);
 AXPYTimer.Stop();
-std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
+std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
-PrecChangeTimer.Start();
-precisionChange(r_f, r_d);
-PrecChangeTimer.Stop();
-c = c_d;
 }
 // Convergence checks
@@ -340,7 +343,7 @@ public:
 RealD css = c * z[s][iz]* z[s][iz];
-if(css<rsq[s]){
+if(css<rsqf[s]){
 if ( ! converged[s] )
 std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 converged[s]=1;
@@ -351,12 +354,17 @@ public:
 }
 }
-if ( all_converged ){
+if ( all_converged || k == MaxIterationsMshift-1){
 SolverTimer.Stop();
-std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
-std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+if ( all_converged ){
+std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
+std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+} else {
+std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
+}
 // Check answers
 for(int s=0; s < nshift; s++) {
 Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
@@ -397,12 +405,10 @@ public:
 return;
 }
 }
-// ugly hack
 std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-// assert(0);
+assert(0);
 }
 };
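The hunks above thread a reusable precisionChangeWorkspace through every double/single conversion in the multi-shift solver. A minimal sketch of the workspace-based API as it is used here; the grid names are placeholders, and the constructor's (output grid, input grid) argument order is inferred from the calls shown in the diff, not stated anywhere:

    // Sketch, not part of the diff: build the conversion plans once, reuse per iteration.
    // DoubleGrid/SingleGrid are placeholder GridBase* for the two precisions.
    precisionChangeWorkspace pc_wk_d_to_s(SingleGrid, DoubleGrid); // double -> single
    precisionChangeWorkspace pc_wk_s_to_d(DoubleGrid, SingleGrid); // single -> double
    precisionChange(p_f,   p_d,   pc_wk_d_to_s); // search direction down to single
    precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // matrix result back up to double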

--- next file ---

@@ -48,7 +48,7 @@ public:
 LinearOperatorBase<FieldF> &Linop_f;
 LinearOperatorBase<FieldD> &Linop_d;
 GridBase* SinglePrecGrid;
-RealD Delta; //reliable update parameter
+RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
 //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
 LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,7 +65,9 @@ public:
 ErrorOnNoConverge(err_on_no_conv),
 DoFinalCleanup(true),
 Linop_fallback(NULL)
-{};
+{
+assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
+};
 void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
 Linop_fallback = &_Linop_fallback;
@@ -116,9 +118,12 @@ public:
 }
 //Single prec initialization
+precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
+precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
 FieldF r_f(SinglePrecGrid);
 r_f.Checkerboard() = r.Checkerboard();
-precisionChange(r_f, r);
+precisionChange(r_f, r, pc_wk_dp_to_sp);
 FieldF psi_f(r_f);
 psi_f = Zero();
@@ -134,7 +139,8 @@ public:
 GridStopWatch LinalgTimer;
 GridStopWatch MatrixTimer;
 GridStopWatch SolverTimer;
+GridStopWatch PrecChangeTimer;
 SolverTimer.Start();
 int k = 0;
 int l = 0;
@@ -173,7 +179,9 @@ public:
 // Stopping condition
 if (cp <= rsq) {
 //Although not written in the paper, I assume that I have to add on the final solution
-precisionChange(mmp, psi_f);
+PrecChangeTimer.Start();
+precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
+PrecChangeTimer.Stop();
 psi = psi + mmp;
@@ -194,7 +202,10 @@ public:
 std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
 std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
 std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
+std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
+std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
 IterationsToComplete = k;
 ReliableUpdatesPerformed = l;
@@ -214,14 +225,21 @@ public:
 else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
-precisionChange(mmp, psi_f);
+PrecChangeTimer.Start();
+precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
+PrecChangeTimer.Stop();
 psi = psi + mmp;
+MatrixTimer.Start();
 Linop_d.HermOpAndNorm(psi, mmp, d, qq);
+MatrixTimer.Stop();
 r = src - mmp;
 psi_f = Zero();
-precisionChange(r_f, r);
+PrecChangeTimer.Start();
+precisionChange(r_f, r, pc_wk_dp_to_sp);
+PrecChangeTimer.Stop();
 cp = norm2(r);
 MaxResidSinceLastRelUp = cp;
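The constructor change above adds a sanity check on Delta. A hedged sketch of what the parameter means in the update logic further down the same file:

    // Sketch: Delta must lie strictly between 0 and 1 or the new assert fires.
    // With Delta = 0.1, a reliable update is performed whenever the running
    // single-precision residual cp has fallen 10x below the residual recorded
    // at the previous update:
    if (cp < Delta * MaxResidSinceLastRelUp) { /* recompute true residual in double */ }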

--- file diff suppressed because it is too large ---

--- next file ---

@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu (0)
-#define CpuSmall (1)
-#define Acc (2)
-#define AccSmall (3)
-#define Shared (4)
-#define SharedSmall (5)
+#define CpuHuge (1)
+#define CpuSmall (2)
+#define Acc (3)
+#define AccHuge (4)
+#define AccSmall (5)
+#define Shared (6)
+#define SharedHuge (7)
+#define SharedSmall (8)
 #undef GRID_MM_VERBOSE
 uint64_t total_shared;
 uint64_t total_device;
@@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
 }
+uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
+uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -170,6 +176,16 @@ void MemoryManager::Init(void)
 }
 }
+str= getenv("GRID_ALLOC_NCACHE_HUGE");
+if ( str ) {
+Nc = atoi(str);
+if ( (Nc>=0) && (Nc < NallocCacheMax)) {
+Ncache[CpuHuge]=Nc;
+Ncache[AccHuge]=Nc;
+Ncache[SharedHuge]=Nc;
+}
+}
 str= getenv("GRID_ALLOC_NCACHE_SMALL");
 if ( str ) {
 Nc = atoi(str);
@@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {
 std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
+std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
+std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
 #endif
 #ifdef GRID_UVM
@@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-int cache = type + small;
+int cache;
+if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+else cache = type;
 return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
 #else
 return ptr;
@@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
 {
-assert(ncache>0);
 #ifdef GRID_OMP
 assert(omp_in_parallel()==0);
 #endif
+if (ncache == 0) return ptr;
 void * ret = NULL;
 int v = -1;
@@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-int cache = type+small;
+int cache;
+if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+else cache = type;
 return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
 return NULL;
@@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
 {
-assert(ncache>0);
 #ifdef GRID_OMP
 assert(omp_in_parallel()==0);
 #endif
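The hunks above split each pointer-cache base type into LARGE, HUGE and SMALL buckets. A sketch of the size classing they imply (not a quotation of the patch, just the same logic lifted into a standalone helper):

    // Sketch: map an allocation to its bucket, mirroring Insert/Lookup above.
    int cacheBucket(size_t bytes, int type) {
      if (bytes <  GRID_ALLOC_SMALL_LIMIT) return type + 2; // e.g. Cpu -> CpuSmall
      if (bytes >= GRID_ALLOC_HUGE_LIMIT)  return type + 1; // e.g. Cpu -> CpuHuge
      return type;                                          // regular LARGE bucket
    }
    // The HUGE buckets default to 0 cached entries (see the Ncache table); opt in
    // at run time with, for example:  GRID_ALLOC_NCACHE_HUGE=4 ./grid_app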

--- next file ---

@@ -35,6 +35,12 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 #define GRID_ALLOC_SMALL_LIMIT (4096)
+#define GRID_ALLOC_HUGE_LIMIT (2147483648)
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__)
+#define AUDIT(a) MemoryManager::Audit(FILE_LINE)
 /*Pinning pages is costly*/
 ////////////////////////////////////////////////////////////////////////////
@@ -65,6 +71,21 @@ enum ViewMode {
 CpuWriteDiscard = 0x10 // same for now
 };
+struct MemoryStatus {
+uint64_t DeviceBytes;
+uint64_t DeviceLRUBytes;
+uint64_t DeviceMaxBytes;
+uint64_t HostToDeviceBytes;
+uint64_t DeviceToHostBytes;
+uint64_t HostToDeviceXfer;
+uint64_t DeviceToHostXfer;
+uint64_t DeviceEvictions;
+uint64_t DeviceDestroy;
+uint64_t DeviceAllocCacheBytes;
+uint64_t HostAllocCacheBytes;
+};
 class MemoryManager {
 private:
@@ -78,7 +99,7 @@ private:
 } AllocationCacheEntry;
 static const int NallocCacheMax=128;
-static const int NallocType=6;
+static const int NallocType=9;
 static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
 static int Victim[NallocType];
 static int Ncache[NallocType];
@@ -92,8 +113,9 @@ private:
 static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
 static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
-static void PrintBytes(void);
 public:
+static void PrintBytes(void);
+static void Audit(std::string s);
 static void Init(void);
 static void InitMessage(void);
 static void *AcceleratorAllocate(size_t bytes);
@@ -113,7 +135,28 @@ private:
 static uint64_t DeviceToHostBytes;
 static uint64_t HostToDeviceXfer;
 static uint64_t DeviceToHostXfer;
+static uint64_t DeviceEvictions;
+static uint64_t DeviceDestroy;
+static uint64_t DeviceCacheBytes();
+static uint64_t HostCacheBytes();
+static MemoryStatus GetFootprint(void) {
+MemoryStatus stat;
+stat.DeviceBytes = DeviceBytes;
+stat.DeviceLRUBytes = DeviceLRUBytes;
+stat.DeviceMaxBytes = DeviceMaxBytes;
+stat.HostToDeviceBytes = HostToDeviceBytes;
+stat.DeviceToHostBytes = DeviceToHostBytes;
+stat.HostToDeviceXfer = HostToDeviceXfer;
+stat.DeviceToHostXfer = DeviceToHostXfer;
+stat.DeviceEvictions = DeviceEvictions;
+stat.DeviceDestroy = DeviceDestroy;
+stat.DeviceAllocCacheBytes = DeviceCacheBytes();
+stat.HostAllocCacheBytes = HostCacheBytes();
+return stat;
+};
 private:
 #ifndef GRID_UVM
 //////////////////////////////////////////////////////////////////////
@@ -170,6 +213,7 @@ private:
 public:
 static void Print(void);
+static void PrintAll(void);
 static void PrintState( void* CpuPtr);
 static int isOpen (void* CpuPtr);
 static void ViewClose(void* CpuPtr,ViewMode mode);
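The new GetFootprint() interface above lets applications poll the memory manager without touching its internals. A short usage sketch (the field names are exactly those declared in the struct; the surrounding program is illustrative):

    // Sketch: query and log the device memory footprint from user code.
    Grid::MemoryStatus st = Grid::MemoryManager::GetFootprint();
    std::cout << st.DeviceBytes           << " bytes resident on device, "
              << st.DeviceEvictions       << " evictions, "
              << st.DeviceAllocCacheBytes << " bytes parked in the device alloc cache"
              << std::endl;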

--- next file ---

@@ -8,9 +8,8 @@ NAMESPACE_BEGIN(Grid);
 static char print_buffer [ MAXLINE ];
 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-//#define dprintf(...) printf (__VA_ARGS__ ); fflush(stdout);
-#define dprintf(...)
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
+//#define dprintf(...)
 ////////////////////////////////////////////////////////////
@@ -29,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes;
 uint64_t MemoryManager::DeviceToHostBytes;
 uint64_t MemoryManager::HostToDeviceXfer;
 uint64_t MemoryManager::DeviceToHostXfer;
+uint64_t MemoryManager::DeviceEvictions;
+uint64_t MemoryManager::DeviceDestroy;
 ////////////////////////////////////
 // Priority ordering for unlocked entries
@@ -116,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 assert(AccCache.CpuPtr!=(uint64_t)NULL);
 if(AccCache.AccPtr) {
 AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
+DeviceDestroy++;
 DeviceBytes -=AccCache.bytes;
 LRUremove(AccCache);
+AccCache.AccPtr=(uint64_t) NULL;
 dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
 }
 uint64_t CpuPtr = AccCache.CpuPtr;
@@ -127,26 +130,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 {
 ///////////////////////////////////////////////////////////////////////////
-// Make CPU consistent, remove from Accelerator, remove entry
-// Cannot be locked. If allocated must be in LRU pool.
+// Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
+// Cannot be acclocked. If allocated must be in LRU pool.
+//
+// Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
+// and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
+// but there is a weakness where CpuLock entries are attempted for erase
+// Take these OUT LRU queue when CPU locked?
+// Cannot take out the table as cpuLock data is important.
 ///////////////////////////////////////////////////////////////////////////
 assert(AccCache.state!=Empty);
-mprintf("MemoryManager: Evict(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
-assert(AccCache.accLock==0);
-assert(AccCache.cpuLock==0);
+mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
+(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
+(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
+if (AccCache.accLock!=0) return;
+if (AccCache.cpuLock!=0) return;
 if(AccCache.state==AccDirty) {
 Flush(AccCache);
 }
-assert(AccCache.CpuPtr!=(uint64_t)NULL);
 if(AccCache.AccPtr) {
 AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
-DeviceBytes -=AccCache.bytes;
 LRUremove(AccCache);
+AccCache.AccPtr=(uint64_t)NULL;
+AccCache.state=CpuDirty; // CPU primary now
+DeviceBytes -=AccCache.bytes;
 dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
 }
-uint64_t CpuPtr = AccCache.CpuPtr;
-EntryErase(CpuPtr);
+// uint64_t CpuPtr = AccCache.CpuPtr;
+DeviceEvictions++;
+// EntryErase(CpuPtr);
 }
 void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 {
@@ -197,6 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
 if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
+dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
 AcceleratorViewClose((uint64_t)Ptr);
 } else if( (mode==CpuRead)||(mode==CpuWrite)){
 CpuViewClose((uint64_t)Ptr);
@@ -208,6 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
 uint64_t CpuPtr = (uint64_t)_CpuPtr;
 if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
+dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
 return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
 } else if( (mode==CpuRead)||(mode==CpuWrite)){
 return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -218,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void MemoryManager::EvictVictims(uint64_t bytes)
 {
+assert(bytes<DeviceMaxBytes);
 while(bytes+DeviceLRUBytes > DeviceMaxBytes){
 if ( DeviceLRUBytes > 0){
 assert(LRU.size()>0);
-uint64_t victim = LRU.back();
+uint64_t victim = LRU.back(); // From the LRU
 auto AccCacheIterator = EntryLookup(victim);
 auto & AccCache = AccCacheIterator->second;
 Evict(AccCache);
+} else {
+return;
 }
 }
 }
@@ -247,11 +265,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
 assert(AccCache.cpuLock==0); // Programming error
 if(AccCache.state!=Empty) {
-dprintf("ViewOpen found entry %lx %lx : %ld %ld\n",
+dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
 (uint64_t)AccCache.CpuPtr,
 (uint64_t)CpuPtr,
 (uint64_t)AccCache.bytes,
-(uint64_t)bytes);
+(uint64_t)bytes,
+(uint64_t)AccCache.accLock);
 assert(AccCache.CpuPtr == CpuPtr);
 assert(AccCache.bytes ==bytes);
 }
@@ -286,6 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
 AccCache.state = Consistent; // Empty + AccRead => Consistent
 }
 AccCache.accLock= 1;
+dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
 } else if(AccCache.state==CpuDirty ){
 if(mode==AcceleratorWriteDiscard) {
 CpuDiscard(AccCache);
@@ -298,28 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
 AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
 }
 AccCache.accLock++;
-dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
+dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
 } else if(AccCache.state==Consistent) {
 if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
 AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
 else
 AccCache.state = Consistent; // Consistent + AccRead => Consistent
 AccCache.accLock++;
-dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
+dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
 } else if(AccCache.state==AccDirty) {
 if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
 AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
 else
 AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
 AccCache.accLock++;
-dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
+dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
 } else {
 assert(0);
 }
-// If view is opened on device remove from LRU
+assert(AccCache.accLock>0);
+// If view is opened on device must remove from LRU
 if(AccCache.LRU_valid==1){
 // must possibly remove from LRU as now locked on GPU
+dprintf("AccCache entry removed from LRU \n");
 LRUremove(AccCache);
 }
@@ -340,10 +362,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
 assert(AccCache.accLock>0);
 AccCache.accLock--;
 // Move to LRU queue if not locked and close on device
 if(AccCache.accLock==0) {
+dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
 LRUinsert(AccCache);
+} else {
+dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
 }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -380,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
 auto AccCacheIterator = EntryLookup(CpuPtr);
 auto & AccCache = AccCacheIterator->second;
-if (!AccCache.AccPtr) {
-EvictVictims(bytes);
-}
+// CPU doesn't need to free space
+// if (!AccCache.AccPtr) {
+// EvictVictims(bytes);
+// }
 assert((mode==CpuRead)||(mode==CpuWrite));
 assert(AccCache.accLock==0); // Programming error
@@ -436,20 +461,28 @@ void MemoryManager::NotifyDeletion(void *_ptr)
 void MemoryManager::Print(void)
 {
 PrintBytes();
-std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
-std::cout << GridLogDebug << "Memory Manager " << std::endl;
-std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
-std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
-std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
-std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
-std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
-std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
-std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
-std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
-std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
-std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
-std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
-std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
+std::cout << GridLogMessage << "Memory Manager " << std::endl;
+std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
+std::cout << GridLogMessage << DeviceBytes << " bytes allocated on device " << std::endl;
+std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
+std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device " << std::endl;
+std::cout << GridLogMessage << HostToDeviceXfer << " transfers to device " << std::endl;
+std::cout << GridLogMessage << DeviceToHostXfer << " transfers from device " << std::endl;
+std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
+std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
+std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
+std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
+std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
+std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
+}
+void MemoryManager::PrintAll(void)
+{
+Print();
+std::cout << GridLogMessage << std::endl;
+std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
+std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
+std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
 auto &AccCache = it->second;
@@ -459,13 +492,13 @@ void MemoryManager::Print(void)
 if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
 if ( AccCache.state==Consistent)str = std::string("Consistent");
-std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
 << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 << "\t" << AccCache.cpuLock
 << "\t" << AccCache.accLock
 << "\t" << AccCache.LRU_valid<<std::endl;
 }
-std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
+std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
 };
 int MemoryManager::isOpen (void* _CpuPtr)
@@ -479,6 +512,64 @@ int MemoryManager::isOpen (void* _CpuPtr)
 return 0;
 }
 }
+void MemoryManager::Audit(std::string s)
+{
+uint64_t CpuBytes=0;
+uint64_t AccBytes=0;
+uint64_t LruBytes1=0;
+uint64_t LruBytes2=0;
+uint64_t LruCnt=0;
+uint64_t LockedBytes=0;
+std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
+for(auto it=LRU.begin();it!=LRU.end();it++){
+uint64_t cpuPtr = *it;
+assert(EntryPresent(cpuPtr));
+auto AccCacheIterator = EntryLookup(cpuPtr);
+auto & AccCache = AccCacheIterator->second;
+LruBytes2+=AccCache.bytes;
+assert(AccCache.LRU_valid==1);
+assert(AccCache.LRU_entry==it);
+}
+std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
+for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
+auto &AccCache = it->second;
+std::string str;
+if ( AccCache.state==Empty ) str = std::string("Empty");
+if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
+if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
+if ( AccCache.state==Consistent)str = std::string("Consistent");
+CpuBytes+=AccCache.bytes;
+if( AccCache.AccPtr ) AccBytes+=AccCache.bytes;
+if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
+if( AccCache.LRU_valid ) LruCnt++;
+if ( AccCache.cpuLock || AccCache.accLock ) {
+assert(AccCache.LRU_valid==0);
+std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
+<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
+<< "\t cpuLock " << AccCache.cpuLock
+<< "\t accLock " << AccCache.accLock
+<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
+}
+assert( AccCache.cpuLock== 0 ) ;
+assert( AccCache.accLock== 0 ) ;
+}
+std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
+assert(LruBytes1==LruBytes2);
+assert(LruBytes1==DeviceLRUBytes);
+std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
+assert(AccBytes==DeviceBytes);
+std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
+assert(LruCnt == LRU.size());
+std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
+}
 void MemoryManager::PrintState(void* _CpuPtr)
 {
@@ -495,8 +586,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
 if ( AccCache.state==EvictNext) str = std::string("EvictNext");
 std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
-std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
-<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
+std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
+<< "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
 << "\t" << AccCache.cpuLock
 << "\t" << AccCache.accLock
 << "\t" << AccCache.LRU_valid<<std::endl;

--- next file ---

@@ -12,7 +12,10 @@ uint64_t MemoryManager::HostToDeviceBytes;
 uint64_t MemoryManager::DeviceToHostBytes;
 uint64_t MemoryManager::HostToDeviceXfer;
 uint64_t MemoryManager::DeviceToHostXfer;
+uint64_t MemoryManager::DeviceEvictions;
+uint64_t MemoryManager::DeviceDestroy;
+void MemoryManager::Audit(std::string s){};
 void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
 void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
 int MemoryManager::isOpen (void* CpuPtr) { return 0;}
@@ -21,6 +24,7 @@ void MemoryManager::PrintState(void* CpuPtr)
 std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
 };
 void MemoryManager::Print(void){};
+void MemoryManager::PrintAll(void){};
 void MemoryManager::NotifyDeletion(void *ptr){};
 NAMESPACE_END(Grid);

--- next file ---

@@ -131,7 +131,7 @@ public:
 template<class obj> void GlobalSum(obj &o){
 typedef typename obj::scalar_type scalar_type;
 int words = sizeof(obj)/sizeof(scalar_type);
-scalar_type * ptr = (scalar_type *)& o;
+scalar_type * ptr = (scalar_type *)& o; // Safe alias
 GlobalSumVector(ptr,words);
 }
@@ -155,7 +155,7 @@ public:
 int xmit_to_rank,int do_xmit,
 void *recv,
 int recv_from_rank,int do_recv,
-int bytes,int dir);
+int xbytes,int rbytes,int dir);
 void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);

--- next file ---

@@ -343,7 +343,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 int bytes,int dir)
 {
 std::vector<CommsRequest_t> list;
-double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir);
+double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
 StencilSendToRecvFromComplete(list,dir);
 return offbytes;
 }
@@ -353,7 +353,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 int dest,int dox,
 void *recv,
 int from,int dor,
-int bytes,int dir)
+int xbytes,int rbytes,int dir)
 {
 int ncomm =communicator_halo.size();
 int commdir=dir%ncomm;
@@ -375,39 +375,31 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 if ( dor ) {
 if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
 tag= dir+from*32;
-ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
 assert(ierr==0);
 list.push_back(rrq);
-off_node_bytes+=bytes;
+off_node_bytes+=rbytes;
 }
 }
 if (dox) {
 if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
 tag= dir+_processor*32;
-ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
 assert(ierr==0);
 list.push_back(xrq);
-off_node_bytes+=bytes;
+off_node_bytes+=xbytes;
 } else {
 void *shm = (void *) this->ShmBufferTranslate(dest,recv);
 assert(shm!=NULL);
-acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
 }
 }
-/* if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
- * this->StencilSendToRecvFromComplete(list,dir);
- * list.resize(0);
- * }
- */
 return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-acceleratorCopySynchronise();
-StencilBarrier();// Synch shared memory on a single nodes
 int nreq=list.size();
 if (nreq==0) return;
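The signature split above lets the send and receive payloads differ, which is what Dirichlet-truncated halos need. A hedged call sketch (variable names are placeholders; only the CartesianCommunicator method and its new argument order come from the diff):

    // Sketch: distinct transmit/receive byte counts, previously a single 'bytes'.
    std::vector<CommsRequest_t> reqs;
    grid.StencilSendToRecvFromBegin(reqs, xmit, dest, do_xmit,
                                    recv,  from, do_recv,
                                    xbytes, rbytes, dir);
    grid.StencilSendToRecvFromComplete(reqs, dir);
    // Note the copy-synchronise and shared-memory barrier were removed from
    // Complete above, so they are apparently left to the stencil-level caller.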

--- next file ---

@@ -126,7 +126,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 int xmit_to_rank,int dox,
 void *recv,
 int recv_from_rank,int dor,
-int bytes, int dir)
+int xbytes,int rbytes, int dir)
 {
 return 2.0*bytes;
 }

--- next file ---

@@ -29,6 +29,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <Grid/GridCore.h>
 #include <pwd.h>
+#include <syscall.h>
 #ifdef GRID_CUDA
 #include <cuda_runtime_api.h>
@@ -36,10 +37,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
-#ifdef GRID_SYCl
+#ifdef GRID_SYCL
+#define GRID_SYCL_LEVEL_ZERO_IPC
 #endif
 NAMESPACE_BEGIN(Grid);
 #define header "SharedMemoryMpi: "
 /*Construct from an MPI communicator*/

--- next file ---

@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 }
 }
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+template <typename T>
+T iDivUp(T a, T b) // Round a / b to nearest higher integer value
+{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+template <typename T>
+__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
+{
+int idx = blockIdx.x*blockDim.x + threadIdx.x;
+if (idx >= e1*e2) return;
+int n, b, o;
+n = idx / e2;
+b = idx % e2;
+o = n*stride + b;
+vector[2*idx + 0] = lo + o;
+vector[2*idx + 1] = ro + o;
+}
+#endif
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 int ent=0;
 if(cbmask == 0x3 ){
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+ent = e1*e2;
+dim3 blockSize(acceleratorThreads());
+dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+accelerator_barrier();
+#else
 for(int n=0;n<e1;n++){
 for(int b=0;b<e2;b++){
 int o =n*stride+b;
 Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
 }
 }
+#endif
 } else {
 for(int n=0;n<e1;n++){
 for(int b=0;b<e2;b++){
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
 int ent=0;
 if ( cbmask == 0x3 ) {
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+ent = e1*e2;
+dim3 blockSize(acceleratorThreads());
+dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+accelerator_barrier();
+#else
 for(int n=0;n<e1;n++){
 for(int b=0;b<e2;b++){
 int o =n*stride;
 Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
 }}
+#endif
 } else {
 for(int n=0;n<e1;n++){
 for(int b=0;b<e2;b++){
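The iDivUp helper above sizes the kernel launch by ceiling division. A worked example of the arithmetic under assumed numbers (1000 table entries, 128 threads per block):

    // Sketch: iDivUp(1000,128) = 8 blocks, covering 8*128 = 1024 threads;
    // the idx >= e1*e2 guard inside populate_Cshift_table retires the 24
    // surplus threads, so every table entry is written exactly once.
    dim3 blockSize(128);
    dim3 gridSize(iDivUp(1000u, 128u)); // = 8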

--- file diff suppressed because it is too large ---

--- next file ---

@@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
 typename std::remove_const<vobj>::type ret;
 typedef typename vobj::scalar_object scalar_object;
-typedef typename vobj::scalar_type scalar_type;
+// typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_type vector_type;
 const int Nsimd = vobj::vector_type::Nsimd();

--- next file ---

@@ -291,8 +291,8 @@ public:
 typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
 conformable(*this,r);
 this->checkerboard = r.Checkerboard();
-auto me = View(AcceleratorWriteDiscard);
 auto him= r.View(AcceleratorRead);
+auto me = View(AcceleratorWriteDiscard);
 accelerator_for(ss,me.size(),vobj::Nsimd(),{
 coalescedWrite(me[ss],him(ss));
 });
@@ -306,8 +306,8 @@ public:
 inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
 this->checkerboard = r.Checkerboard();
 conformable(*this,r);
-auto me = View(AcceleratorWriteDiscard);
 auto him= r.View(AcceleratorRead);
+auto me = View(AcceleratorWriteDiscard);
 accelerator_for(ss,me.size(),vobj::Nsimd(),{
 coalescedWrite(me[ss],him(ss));
 });
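The diff does not state why the two View opens are swapped, but a plausible reading, given the eviction machinery changed in this same branch, is ordering safety. A hedged sketch of that reading:

    // Sketch (rationale inferred, not stated in the diff):
    auto him = r.View(AcceleratorRead);        // pin the source on the device first
    auto me  = View(AcceleratorWriteDiscard);  // any eviction this open triggers can
                                               // no longer select the locked source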

--- next file ---

@@ -32,7 +32,6 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
 {
 typedef typename vobj::scalar_object sobj;
-typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_type vector_type;
 int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -82,7 +81,6 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
 {
 typedef typename vobj::scalar_object sobj;
-typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_type vector_type;
 int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -130,7 +128,6 @@ template<class vobj>
 static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
 {
 typedef typename vobj::scalar_object sobj;
-typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_type vector_type;
 GridBase *FullGrid = lhs.Grid();

--- next file ---

@@ -96,9 +96,6 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
 GridBase *grid=l.Grid();
-typedef typename vobj::scalar_type scalar_type;
-typedef typename vobj::vector_type vector_type;
 int Nsimd = grid->Nsimd();
 assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@@ -136,9 +133,6 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
 GridBase *grid=l.Grid();
-typedef typename vobj::scalar_type scalar_type;
-typedef typename vobj::vector_type vector_type;
 int Nsimd = grid->Nsimd();
 assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@@ -179,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
 idx= grid->iIndex(site);
 odx= grid->oIndex(site);
-scalar_type * vp = (scalar_type *)&l[odx];
+const vector_type *vp = (const vector_type *) &l[odx];
 scalar_type * pt = (scalar_type *)&s;
 for(int w=0;w<words;w++){
-pt[w] = vp[idx+w*Nsimd];
+pt[w] = getlane(vp[w],idx);
 }
 return;
@@ -216,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
 idx= grid->iIndex(site);
 odx= grid->oIndex(site);
-scalar_type * vp = (scalar_type *)&l[odx];
+vector_type * vp = (vector_type *)&l[odx];
 scalar_type * pt = (scalar_type *)&s;
 for(int w=0;w<words;w++){
-vp[idx+w*Nsimd] = pt[w];
+putlane(vp[w],pt[w],idx);
 }
 return;
 };
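The lane helpers above replace scalar-pointer aliasing of SIMD words with explicit lane access. A condensed sketch of the new access pattern, using the same names as the diff:

    // Sketch: vp[w] is the w-th SIMD word of the site's tensor; idx selects
    // this site's lane inside each word.
    for(int w=0;w<words;w++) pt[w] = getlane(vp[w], idx);  // peek one lane out
    for(int w=0;w<words;w++) putlane(vp[w], pt[w], idx);   // poke one lane in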

--- next file ---

@@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
+#if defined(GRID_SYCL)
+#include <Grid/lattice/Lattice_reduction_sycl.h>
+#endif
 NAMESPACE_BEGIN(Grid);
@@ -91,10 +94,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
 for(int i=0;i<nthread;i++){
 ssum = ssum+sumarray[i];
 }
-typedef typename vobj::scalar_object ssobj;
-ssobj ret = ssum;
-return ret;
+return ssum;
 }
 /*
 Threaded max, don't use for now
@@ -127,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 return sum_gpu(arg,osites);
 #else
 return sum_cpu(arg,osites);
@@ -136,7 +136,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 return sumD_gpu(arg,osites);
 #else
 return sumD_cpu(arg,osites);
@@ -145,7 +145,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 return sumD_gpu_large(arg,osites);
 #else
 return sumD_cpu(arg,osites);
@@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 }
 template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
-autoView( arg_v, arg, AcceleratorRead);
 Integer osites = arg.Grid()->oSites();
-auto ssum= sum_gpu(&arg_v[0],osites);
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+autoView( arg_v, arg, AcceleratorRead);
+return sum_gpu(&arg_v[0],osites);
 #else
 autoView(arg_v, arg, CpuRead);
-Integer osites = arg.Grid()->oSites();
-auto ssum= sum_cpu(&arg_v[0],osites);
+return sum_cpu(&arg_v[0],osites);
 #endif
+}
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+auto ssum = rankSum(arg);
 arg.Grid()->GlobalSum(ssum);
 return ssum;
 }
 template<class vobj>
-inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 autoView( arg_v, arg, AcceleratorRead);
 Integer osites = arg.Grid()->oSites();
-auto ssum= sum_gpu_large(&arg_v[0],osites);
+return sum_gpu_large(&arg_v[0],osites);
 #else
 autoView(arg_v, arg, CpuRead);
 Integer osites = arg.Grid()->oSites();
-auto ssum= sum_cpu(&arg_v[0],osites);
+return sum_cpu(&arg_v[0],osites);
 #endif
+}
+template<class vobj>
+inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+{
+auto ssum = rankSumLarge(arg);
 arg.Grid()->GlobalSum(ssum);
 return ssum;
 }
@@ -222,7 +233,6 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 template<class vobj>
 inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
-typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_typeD vector_type;
 ComplexD nrm;
@@ -236,11 +246,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
 typedef decltype(innerProductD(vobj(),vobj())) inner_t;
 Vector<inner_t> inner_tmp(sites);
 auto inner_tmp_v = &inner_tmp[0];
 {
 autoView( left_v , left, AcceleratorRead);
 autoView( right_v,right, AcceleratorRead);
-// This code could read coalesce
 // GPU - SIMT lane compliance...
 accelerator_for( ss, sites, nsimd,{
 auto x_l = left_v(ss);
@@ -299,7 +308,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 conformable(z,x);
 conformable(x,y);
-typedef typename vobj::scalar_type scalar_type;
 // typedef typename vobj::vector_typeD vector_type;
 RealD nrm;
@@ -344,7 +352,6 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 {
 conformable(left,right);
-typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_typeD vector_type;
 Vector<ComplexD> tmp(2);
@@ -600,7 +607,8 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
 template<class vobj>
 static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
 int orthogdim,RealD scale=1.0)
 {
+// perhaps easier to just promote A to a field and use regular madd
 typedef typename vobj::scalar_object sobj;
 typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_type vector_type;
@@ -631,8 +639,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
 for(int l=0;l<Nsimd;l++){
 grid->iCoorFromIindex(icoor,l);
 int ldx =r+icoor[orthogdim]*rd;
-scalar_type *as =(scalar_type *)&av;
-as[l] = scalar_type(a[ldx])*zscale;
+av.putlane(scalar_type(a[ldx])*zscale,l);
 }
 tensor_reduced at; at=av;
@@ -672,7 +679,6 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
 {
 typedef typename vobj::scalar_object sobj;
-typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_type vector_type;
 int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -726,7 +732,6 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
 {
 typedef typename vobj::scalar_object sobj;
-typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_type vector_type;
 int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -780,7 +785,6 @@ template<class vobj>
 static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
 {
 typedef typename vobj::scalar_object sobj;
-typedef typename vobj::scalar_type scalar_type;
 typedef typename vobj::vector_type vector_type;
 GridBase *FullGrid = lhs.Grid();
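The refactor above separates the node-local reduction (rankSum) from the globally summed result (sum), matching the pull request "Separate rankSum from sum". A usage sketch; the field and grid are placeholders:

    // Sketch: LatticeComplexD 'field' on some GridCartesian 'grid'.
    auto local  = rankSum(field);   // this rank's partial sum, no communication
    auto global = sum(field);       // rankSum followed by arg.Grid()->GlobalSum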

View File

@ -211,13 +211,25 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
assert(ok); assert(ok);
Integer smemSize = numThreads * sizeof(sobj); Integer smemSize = numThreads * sizeof(sobj);
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks); Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier(); accelerator_barrier();
auto result = buffer_v[0]; result = *buffer_v;
#endif
return result; return result;
} }
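
The hunk above replaces a UVM-backed Vector with a device-resident commVector, launches reduceKernel on Grid's computeStream, and fences that stream before an explicit device-to-host copy of the result. A minimal stand-alone CUDA sketch of the same pattern (illustrative kernel and names, not Grid's own):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void squareKernel(const double *in, double *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] * in[i];
}

int main(void) {
  const int N = 256;
  double host_in[N], host_out[N];
  for (int i = 0; i < N; i++) host_in[i] = 1.0;

  double *dev_in, *dev_out;                 // device-resident, not UVM
  cudaMalloc(&dev_in,  N * sizeof(double));
  cudaMalloc(&dev_out, N * sizeof(double));

  cudaStream_t computeStream;
  cudaStreamCreate(&computeStream);

  cudaMemcpyAsync(dev_in, host_in, N * sizeof(double),
                  cudaMemcpyHostToDevice, computeStream);
  squareKernel<<<(N + 127) / 128, 128, 0, computeStream>>>(dev_in, dev_out, N);
  // Fence the *compute* stream explicitly; synchronising only the default
  // stream would not order work submitted to computeStream.
  cudaStreamSynchronize(computeStream);
  cudaMemcpy(host_out, dev_out, N * sizeof(double), cudaMemcpyDeviceToHost);

  printf("out[0] = %f\n", host_out[0]);
  cudaFree(dev_in); cudaFree(dev_out);
  cudaStreamDestroy(computeStream);
  return 0;
}
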
@ -250,8 +262,6 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
template <class vobj> template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{ {
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj; typedef typename vobj::scalar_objectD sobj;
sobj ret; sobj ret;

View File

@ -0,0 +1,125 @@
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
sobj identity; zeroit(identity);
sobj ret ;
Integer nsimd= vobj::Nsimd();
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction,
[=] (cl::sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
theGridAccelerator->wait();
ret = mysum[0];
free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret);
return dret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
return sumD_gpu_tensor(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return the same precision as the input, performing the reduction in double precision
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);
/*
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
{
Double sumResult; zeroit(sumResult);
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
Double identity; zeroit(identity);
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum +=vec[index];
});
});
theGridAccelerator->wait();
Double ret = d_sum[0];
free(d_sum,*theGridAccelerator);
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is too late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/

View File

@ -440,6 +440,7 @@ public:
_grid->GlobalCoorToGlobalIndex(gcoor,gidx); _grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
assert(rank == _grid->ThisRank() ); assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx); int l_idx=generator_idx(o_idx,i_idx);

View File

@ -194,11 +194,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
#endif #endif
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) { accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v); precisionChange(out,in);
} }
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); precisionChange(out,in);
} }
template<typename T1,typename T2> template<typename T1,typename T2>
@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
} }
} }
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
template<class vobj,class vobj2,class CComplex> template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
#endif #endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
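
For context, a minimal usage sketch of the two batched routines added above; grid construction and the basis fields are assumed to exist elsewhere, and nbasis=16 is arbitrary:

#include <Grid/Grid.h>
using namespace Grid;

const int nbasis = 16;                                    // illustrative
typedef Lattice<iVector<vTComplex, nbasis>> CoarseVector;

void batchRoundTrip(GridCartesian *CGrid,                 // coarse grid, assumed
                    std::vector<LatticeFermion> &fine,    // NBatch fine fields
                    std::vector<LatticeFermion> &basis)   // nbasis basis vectors
{
  std::vector<CoarseVector> coarse(fine.size(), CoarseVector(CGrid));

  // Project every field in the batch onto the blocked basis in one sweep;
  // the basis loop is outermost, so each basis vector is streamed only once.
  batchBlockProject(coarse, fine, basis);

  // Rebuild the fine fields from their coarse coefficients.
  batchBlockPromote(coarse, fine, basis);
}
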
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local // Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj> template<class vobj,class vvobj>
@ -677,10 +726,10 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
scalar_type * fp = (scalar_type *)&f_v[odx_f]; vector_type * fp = (vector_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t]; vector_type * tp = (vector_type *)&t_v[odx_t];
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun won't work tp[w].putlane(fp[w].getlane(idx_f),idx_t);
} }
} }
}); });
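
The replacement above swaps the scalar_type pointer punning for the vector type's own lane accessors. A small fragment, assuming the getlane/putlane members used in the hunk:

#include <Grid/Grid.h>
using namespace Grid;

// Lane-safe element copy: getlane/putlane go through the SIMD vector's
// accessors, so the copy is correct for either complex layout (RIRI or RRII),
// unlike a raw scalar_type pointer cast.
void laneCopy(const vComplexF &src, vComplexF &dst) {
  for (int l = 0; l < vComplexF::Nsimd(); l++) {
    ComplexF tmp = src.getlane(l); // read lane l as a scalar
    dst.putlane(tmp, l);           // write it into lane l of dst
  }
}
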
@ -1080,9 +1129,27 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
}); });
} }
//Convert a Lattice from one precision to another //Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
template<class VobjOut, class VobjIn> template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in) void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
typedef typename VobjOut::vector_type Vout;
typedef typename VobjIn::vector_type Vin;
const int N = sizeof(VobjOut)/sizeof(Vout);
conformable(out.Grid(),in.Grid());
out.Checkerboard() = in.Checkerboard();
int nsimd = out.Grid()->Nsimd();
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(idx,out.Grid()->oSites(),1,{
Vout *vout = (Vout *)&out_v[idx];
Vin *vin = (Vin *)&in_v[idx];
precisionChange(vout,vin,N);
});
}
//Convert a Lattice from one precision to another (original, slow implementation)
template<class VobjOut, class VobjIn>
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{ {
assert(out.Grid()->Nd() == in.Grid()->Nd()); assert(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){ for(int d=0;d<out.Grid()->Nd();d++){
@ -1097,7 +1164,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
int ndim = out.Grid()->Nd(); int ndim = out.Grid()->Nd();
int out_nsimd = out_grid->Nsimd(); int out_nsimd = out_grid->Nsimd();
int in_nsimd = in_grid->Nsimd();
std::vector<Coordinate > out_icoor(out_nsimd); std::vector<Coordinate > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){ for(int lane=0; lane < out_nsimd; lane++){
@ -1128,6 +1195,128 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
}); });
} }
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
class precisionChangeWorkspace{
std::pair<Integer,Integer>* fmap_device; //device pointer
//maintain grids for checking
GridBase* _out_grid;
GridBase* _in_grid;
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
}
int Nsimd_out = out_grid->Nsimd();
std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
for(int lane=0; lane < out_grid->Nsimd(); lane++)
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocorr;
out_grid->oCoorFromOindex(out_ocorr, out_oidx);
Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
//int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
//Note oIndex and OcorrFromOindex (and same for iIndex) are not inverses for checkerboarded lattices, the former coordinates being defined on the full lattice and the latter on the reduced lattice
//Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
int in_oidx = 0, in_lane = 0;
for(int d=0;d<in_grid->_ndimension;d++){
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
}
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
}
});
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
}
//Prevent moving or copying
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
void checkGrids(GridBase* out, GridBase* in) const{
conformable(out, _out_grid);
conformable(in, _in_grid);
}
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
template<class VobjOut, class VobjIn>
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
if(out.Grid() == in.Grid()){
precisionChangeFast(out,in);
return 1;
}else{
return 0;
}
}
template<class VobjOut, class VobjIn>
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
return 0;
}
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
//which contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
if(_precisionChangeFastWrap(out,in,0)) return;
static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
workspace.checkGrids(out.Grid(),in.Grid());
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
int in_oidx = fmap_osite[out_lane].first;
int in_lane = fmap_osite[out_lane].second;
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
}
});
}
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
if(_precisionChangeFastWrap(out,in,0)) return;
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
precisionChange(out, in, workspace);
}
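
A usage sketch of the workspace path (grids and fields assumed constructed elsewhere); the map is built once on the host, copied to the device, and then amortised over every subsequent conversion:

#include <Grid/Grid.h>
using namespace Grid;

void manyConversions(GridCartesian *GridD, GridCartesian *GridF,
                     std::vector<LatticeFermionD> &dfields,
                     std::vector<LatticeFermionF> &ffields)
{
  // Build the site/lane map once; the constructor walks both grids on the
  // host and copies the map to the device.
  precisionChangeWorkspace wk(GridF, GridD);   // (out_grid, in_grid)

  // Subsequent calls reuse the cached map and run entirely on the device.
  for (size_t i = 0; i < dfields.size(); i++)
    precisionChange(ffields[i], dfields[i], wk);
}
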
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Communicate between grids // Communicate between grids
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////

View File

@ -66,6 +66,7 @@ GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW"); GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL"); GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL"); GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE"); GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE"); GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
@ -77,7 +78,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(1); GridLogError.Active(1);
GridLogWarning.Active(0); GridLogWarning.Active(0);
GridLogMessage.Active(1); // at least the messages should be always on GridLogMessage.Active(1); // at least the messages should be always on
GridLogMemory.Active(0); // at least the messages should be always on GridLogMemory.Active(0);
GridLogTracing.Active(0);
GridLogIterative.Active(0); GridLogIterative.Active(0);
GridLogDebug.Active(0); GridLogDebug.Active(0);
GridLogPerformance.Active(0); GridLogPerformance.Active(0);
@ -87,6 +89,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogHMC.Active(1); GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) { for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1); if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1); if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
@ -94,8 +97,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0); if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
} }
} }
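
A minimal sketch of activating the new stream by name through the function above (stream names are matched as plain strings):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  // Enable the Tracing and Memory streams; Message stays on by default.
  std::vector<std::string> streams = {"Tracing", "Memory"};
  GridLogConfigure(streams);
  GridLogTracing << "tracing is now active" << std::endl;
  Grid_finalize();
  return 0;
}
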

View File

@ -186,6 +186,7 @@ extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ; extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC; extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory; extern GridLogger GridLogMemory;
extern GridLogger GridLogTracing;
extern Colours GridLogColours; extern Colours GridLogColours;
std::string demangle(const char* name) ; std::string demangle(const char* name) ;

View File

@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_PERFCOUNT_H #ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H #define GRID_PERFCOUNT_H
#ifndef __SSC_START
#define __SSC_START
#define __SSC_STOP
#endif
#include <sys/time.h> #include <sys/time.h>
#include <ctime> #include <ctime>
#include <chrono> #include <chrono>

View File

@ -1,4 +1,7 @@
#pragma once #pragma once
NAMESPACE_BEGIN(Grid);
#ifdef GRID_TRACING_NVTX #ifdef GRID_TRACING_NVTX
#include <nvToolsExt.h> #include <nvToolsExt.h>
class GridTracer { class GridTracer {
@ -64,3 +67,4 @@ inline void traceStop(int ID) { }
#else #else
#define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name); #define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name);
#endif #endif
NAMESPACE_END(Grid);
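
A usage sketch of the scoped tracer (NVTX backend shown above; other backends follow the same RAII shape): the range opens at the macro and closes when the GridTracer destructor runs at end of scope.

void dslashPhase(void) {
  GRID_TRACE("dslashPhase");  // opens a range named "dslashPhase"
  // ... kernels launched here are attributed to the range ...
}                             // range popped here by the tracer destructor
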

View File

@ -16,7 +16,7 @@
#ifdef __NVCC__ #ifdef __NVCC__
#pragma push #pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5) #ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning" #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else #else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning" #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"

View File

@ -126,6 +126,7 @@ typedef iSpinMatrix<ComplexD > SpinMatrixD;
typedef iSpinMatrix<vComplex > vSpinMatrix; typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iSpinMatrix<vComplexF> vSpinMatrixF; typedef iSpinMatrix<vComplexF> vSpinMatrixF;
typedef iSpinMatrix<vComplexD> vSpinMatrixD; typedef iSpinMatrix<vComplexD> vSpinMatrixD;
typedef iSpinMatrix<vComplexD2> vSpinMatrixD2;
// Colour Matrix // Colour Matrix
typedef iColourMatrix<Complex > ColourMatrix; typedef iColourMatrix<Complex > ColourMatrix;
@ -135,6 +136,7 @@ typedef iColourMatrix<ComplexD > ColourMatrixD;
typedef iColourMatrix<vComplex > vColourMatrix; typedef iColourMatrix<vComplex > vColourMatrix;
typedef iColourMatrix<vComplexF> vColourMatrixF; typedef iColourMatrix<vComplexF> vColourMatrixF;
typedef iColourMatrix<vComplexD> vColourMatrixD; typedef iColourMatrix<vComplexD> vColourMatrixD;
typedef iColourMatrix<vComplexD2> vColourMatrixD2;
// SpinColour matrix // SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix; typedef iSpinColourMatrix<Complex > SpinColourMatrix;
@ -144,6 +146,7 @@ typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix; typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF; typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD; typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
typedef iSpinColourMatrix<vComplexD2> vSpinColourMatrixD2;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
@ -153,6 +156,7 @@ typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplexD2> vSpinColourSpinColourMatrixD2;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
@ -162,33 +166,37 @@ typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplexD2> vSpinColourSpinColourMatrixD2;
// LorentzColour // LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix; typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF; typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD; typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix; typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF; typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD; typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
// DoubleStored gauge field // DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplexD2> vDoubleStoredColourMatrixD2;
//G-parity flavour matrix //G-parity flavour matrix
typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix; typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF; typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD; typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix; typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF; typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD; typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplexD2> vGparityFlavourMatrixD2;
// Spin vector // Spin vector
@ -199,6 +207,7 @@ typedef iSpinVector<ComplexD> SpinVectorD;
typedef iSpinVector<vComplex > vSpinVector; typedef iSpinVector<vComplex > vSpinVector;
typedef iSpinVector<vComplexF> vSpinVectorF; typedef iSpinVector<vComplexF> vSpinVectorF;
typedef iSpinVector<vComplexD> vSpinVectorD; typedef iSpinVector<vComplexD> vSpinVectorD;
typedef iSpinVector<vComplexD2> vSpinVectorD2;
// Colour vector // Colour vector
typedef iColourVector<Complex > ColourVector; typedef iColourVector<Complex > ColourVector;
@ -208,6 +217,7 @@ typedef iColourVector<ComplexD> ColourVectorD;
typedef iColourVector<vComplex > vColourVector; typedef iColourVector<vComplex > vColourVector;
typedef iColourVector<vComplexF> vColourVectorF; typedef iColourVector<vComplexF> vColourVectorF;
typedef iColourVector<vComplexD> vColourVectorD; typedef iColourVector<vComplexD> vColourVectorD;
typedef iColourVector<vComplexD2> vColourVectorD2;
// SpinColourVector // SpinColourVector
typedef iSpinColourVector<Complex > SpinColourVector; typedef iSpinColourVector<Complex > SpinColourVector;
@ -217,6 +227,7 @@ typedef iSpinColourVector<ComplexD> SpinColourVectorD;
typedef iSpinColourVector<vComplex > vSpinColourVector; typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSpinColourVector<vComplexF> vSpinColourVectorF; typedef iSpinColourVector<vComplexF> vSpinColourVectorF;
typedef iSpinColourVector<vComplexD> vSpinColourVectorD; typedef iSpinColourVector<vComplexD> vSpinColourVectorD;
typedef iSpinColourVector<vComplexD2> vSpinColourVectorD2;
// HalfSpin vector // HalfSpin vector
typedef iHalfSpinVector<Complex > HalfSpinVector; typedef iHalfSpinVector<Complex > HalfSpinVector;
@ -226,15 +237,17 @@ typedef iHalfSpinVector<ComplexD> HalfSpinVectorD;
typedef iHalfSpinVector<vComplex > vHalfSpinVector; typedef iHalfSpinVector<vComplex > vHalfSpinVector;
typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF; typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF;
typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD; typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD;
typedef iHalfSpinVector<vComplexD2> vHalfSpinVectorD2;
// HalfSpinColour vector // HalfSpinColour vector
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector; typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF; typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD; typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector; typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF; typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD; typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplexD2> vHalfSpinColourVectorD2;
//G-parity flavour vector //G-parity flavour vector
typedef iGparityFlavourVector<Complex > GparityFlavourVector; typedef iGparityFlavourVector<Complex > GparityFlavourVector;
@ -244,7 +257,7 @@ typedef iGparityFlavourVector<ComplexD> GparityFlavourVectorD;
typedef iGparityFlavourVector<vComplex > vGparityFlavourVector; typedef iGparityFlavourVector<vComplex > vGparityFlavourVector;
typedef iGparityFlavourVector<vComplexF> vGparityFlavourVectorF; typedef iGparityFlavourVector<vComplexF> vGparityFlavourVectorF;
typedef iGparityFlavourVector<vComplexD> vGparityFlavourVectorD; typedef iGparityFlavourVector<vComplexD> vGparityFlavourVectorD;
typedef iGparityFlavourVector<vComplexD2> vGparityFlavourVectorD2;
// singlets // singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
@ -254,6 +267,7 @@ typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tenso
typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<vComplexD2> vTComplexD2; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without? typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without? typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
@ -271,47 +285,58 @@ typedef iSinglet<Integer > TInteger;
typedef Lattice<vColourMatrix> LatticeColourMatrix; typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vColourMatrixF> LatticeColourMatrixF; typedef Lattice<vColourMatrixF> LatticeColourMatrixF;
typedef Lattice<vColourMatrixD> LatticeColourMatrixD; typedef Lattice<vColourMatrixD> LatticeColourMatrixD;
typedef Lattice<vColourMatrixD2> LatticeColourMatrixD2;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix; typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF; typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF;
typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD; typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD;
typedef Lattice<vSpinMatrixD2> LatticeSpinMatrixD2;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix; typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF; typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF;
typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD; typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD;
typedef Lattice<vSpinColourMatrixD2> LatticeSpinColourMatrixD2;
typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix; typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix;
typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF; typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF;
typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD; typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD;
typedef Lattice<vSpinColourSpinColourMatrixD2> LatticeSpinColourSpinColourMatrixD2;
typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix; typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix;
typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF; typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD; typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
// DoubleStored gauge field // DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix; typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF; typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD; typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
typedef Lattice<vDoubleStoredColourMatrixD2> LatticeDoubleStoredColourMatrixD2;
typedef Lattice<vSpinVector> LatticeSpinVector; typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vSpinVectorF> LatticeSpinVectorF; typedef Lattice<vSpinVectorF> LatticeSpinVectorF;
typedef Lattice<vSpinVectorD> LatticeSpinVectorD; typedef Lattice<vSpinVectorD> LatticeSpinVectorD;
typedef Lattice<vSpinVectorD2> LatticeSpinVectorD2;
typedef Lattice<vColourVector> LatticeColourVector; typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vColourVectorF> LatticeColourVectorF; typedef Lattice<vColourVectorF> LatticeColourVectorF;
typedef Lattice<vColourVectorD> LatticeColourVectorD; typedef Lattice<vColourVectorD> LatticeColourVectorD;
typedef Lattice<vColourVectorD2> LatticeColourVectorD2;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector; typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF; typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF;
typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD; typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD;
typedef Lattice<vSpinColourVectorD2> LatticeSpinColourVectorD2;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector; typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF; typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF;
typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD; typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD;
typedef Lattice<vHalfSpinVectorD2> LatticeHalfSpinVectorD2;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector; typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF; typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD; typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
typedef Lattice<vHalfSpinColourVectorD2> LatticeHalfSpinColourVectorD2;
typedef Lattice<vTReal> LatticeReal; typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTRealF> LatticeRealF; typedef Lattice<vTRealF> LatticeRealF;
@ -320,6 +345,7 @@ typedef Lattice<vTRealD> LatticeRealD;
typedef Lattice<vTComplex> LatticeComplex; typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTComplexF> LatticeComplexF; typedef Lattice<vTComplexF> LatticeComplexF;
typedef Lattice<vTComplexD> LatticeComplexD; typedef Lattice<vTComplexD> LatticeComplexD;
typedef Lattice<vTComplexD2> LatticeComplexD2;
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where" typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
@ -327,37 +353,42 @@ typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
/////////////////////////////////////////// ///////////////////////////////////////////
// Physical names for things // Physical names for things
/////////////////////////////////////////// ///////////////////////////////////////////
typedef LatticeHalfSpinColourVector LatticeHalfFermion; typedef LatticeHalfSpinColourVector LatticeHalfFermion;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF; typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD; typedef LatticeHalfSpinColourVectorD LatticeHalfFermionD;
typedef LatticeHalfSpinColourVectorD2 LatticeHalfFermionD2;
typedef LatticeSpinColourVector LatticeFermion; typedef LatticeSpinColourVector LatticeFermion;
typedef LatticeSpinColourVectorF LatticeFermionF; typedef LatticeSpinColourVectorF LatticeFermionF;
typedef LatticeSpinColourVectorD LatticeFermionD; typedef LatticeSpinColourVectorD LatticeFermionD;
typedef LatticeSpinColourVectorD2 LatticeFermionD2;
typedef LatticeSpinColourMatrix LatticePropagator; typedef LatticeSpinColourMatrix LatticePropagator;
typedef LatticeSpinColourMatrixF LatticePropagatorF; typedef LatticeSpinColourMatrixF LatticePropagatorF;
typedef LatticeSpinColourMatrixD LatticePropagatorD; typedef LatticeSpinColourMatrixD LatticePropagatorD;
typedef LatticeSpinColourMatrixD2 LatticePropagatorD2;
typedef LatticeLorentzColourMatrix LatticeGaugeField; typedef LatticeLorentzColourMatrix LatticeGaugeField;
typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF; typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF;
typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD; typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD;
typedef LatticeLorentzColourMatrixD2 LatticeGaugeFieldD2;
typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField; typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField;
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF; typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD; typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
typedef LatticeDoubleStoredColourMatrixD2 LatticeDoubledGaugeFieldD2;
template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >; template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
// Uhgg... typing this hurt ;)
// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
typedef Lattice<vColourVector> LatticeStaggeredFermion; typedef Lattice<vColourVector> LatticeStaggeredFermion;
typedef Lattice<vColourVectorF> LatticeStaggeredFermionF; typedef Lattice<vColourVectorF> LatticeStaggeredFermionF;
typedef Lattice<vColourVectorD> LatticeStaggeredFermionD; typedef Lattice<vColourVectorD> LatticeStaggeredFermionD;
typedef Lattice<vColourVectorD2> LatticeStaggeredFermionD2;
typedef Lattice<vColourMatrix> LatticeStaggeredPropagator; typedef Lattice<vColourMatrix> LatticeStaggeredPropagator;
typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF; typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF;
typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD; typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD;
typedef Lattice<vColourMatrixD2> LatticeStaggeredPropagatorD2;
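
A sketch of what the new D2 typedefs are for, assuming the same-grid fast path described earlier in this comparison: a vComplexD2 field packs two doubles per single-precision SIMD lane, so a LatticeFermionD2 lives on the *same* Grid as a LatticeFermionF and precisionChange can dispatch to precisionChangeFast.

#include <Grid/Grid.h>
using namespace Grid;

void fastRoundTrip(GridCartesian *UGridF) {  // single-precision grid, assumed
  LatticeFermionF  f(UGridF);
  LatticeFermionD2 d2(UGridF);               // same grid as the F field
  f = Zero();
  precisionChange(d2, f);  // grids match, so the fast same-grid path is taken
  precisionChange(f, d2);
}
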
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Peek and Poke named after physics attributes // Peek and Poke named after physics attributes
@ -476,9 +507,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
// Fermion <-> propagator assignements // Fermion <-> propagator assignements
////////////////////////////////////////////// //////////////////////////////////////////////
//template <class Prop, class Ferm> //template <class Prop, class Ferm>
#define FAST_FERM_TO_PROP
template <class Fimpl> template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c) void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{ {
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,CpuWrite);
autoView(f_v,f,CpuRead);
thread_for(idx,p_v.oSites(),{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
}}
});
#else
for(int j = 0; j < Ns; ++j) for(int j = 0; j < Ns; ++j)
{ {
auto pjs = peekSpin(p, j, s); auto pjs = peekSpin(p, j, s);
@ -490,12 +532,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
} }
pokeSpin(p, pjs, j, s); pokeSpin(p, pjs, j, s);
} }
#endif
} }
//template <class Prop, class Ferm> //template <class Prop, class Ferm>
template <class Fimpl> template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c) void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{ {
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,CpuRead);
autoView(f_v,f,CpuWrite);
thread_for(idx,p_v.oSites(),{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
}}
});
#else
for(int j = 0; j < Ns; ++j) for(int j = 0; j < Ns; ++j)
{ {
auto pjs = peekSpin(p, j, s); auto pjs = peekSpin(p, j, s);
@ -507,6 +560,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
} }
pokeSpin(f, fj, j); pokeSpin(f, fj, j);
} }
#endif
} }
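
A typical driver for the two routines above, sketched under the assumption that a solver functor and a source propagator exist; Fimpl is taken to be WilsonImplR and the loop bounds follow the spin/colour indices used in the hunk:

#include <Grid/Grid.h>
#include <functional>
using namespace Grid;

void buildPropagator(LatticePropagator &prop,
                     LatticePropagator &source,
                     std::function<void(LatticeFermion&,LatticeFermion&)> solve)
{
  LatticeFermion src(prop.Grid()), sol(prop.Grid());
  for (int s = 0; s < Ns; s++) {
    for (int c = 0; c < Nc; c++) {
      PropToFerm<WilsonImplR>(src, source, s, c); // pull out one s,c column
      solve(src, sol);                            // hypothetical solver call
      FermToProp<WilsonImplR>(prop, sol, s, c);   // scatter it back
    }
  }
}
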
////////////////////////////////////////////// //////////////////////////////////////////////

View File

@ -50,19 +50,23 @@ public:
RealD refresh_us; RealD refresh_us;
void reset_timer(void) { void reset_timer(void) {
deriv_us = S_us = refresh_us = 0.0; deriv_us = S_us = refresh_us = 0.0;
deriv_num=0;
deriv_norm_sum = deriv_max_sum=0.0; deriv_norm_sum = deriv_max_sum=0.0;
Fdt_max_sum = Fdt_norm_sum = 0.0; Fdt_max_sum = Fdt_norm_sum = 0.0;
deriv_num=0;
} }
void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) { void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
deriv_max_sum+=max; if ( max > deriv_max_sum ) {
deriv_max_sum=max;
}
deriv_norm_sum+=nrm; deriv_norm_sum+=nrm;
Fdt_max_sum+=Fdt_max; if ( Fdt_max > Fdt_max_sum ) {
Fdt_max_sum=Fdt_max;
}
Fdt_norm_sum+=Fdt_nrm; deriv_num++; Fdt_norm_sum+=Fdt_nrm; deriv_num++;
} }
RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; }; RealD deriv_max_average(void) { return deriv_max_sum; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; }; RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
RealD Fdt_max_average(void) { return Fdt_max_sum/deriv_num; }; RealD Fdt_max_average(void) { return Fdt_max_sum; };
RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; }; RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; };
RealD deriv_timer(void) { return deriv_us; }; RealD deriv_timer(void) { return deriv_us; };
RealD S_timer(void) { return S_us; }; RealD S_timer(void) { return S_us; };

View File

@ -34,34 +34,43 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams { struct GparityWilsonImplParams {
Coordinate twists; Coordinate twists;
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
Coordinate dirichlet; // Blocksize of dirichlet BCs Coordinate dirichlet; // Blocksize of dirichlet BCs
GparityWilsonImplParams() : twists(Nd, 0) { dirichlet.resize(0); }; int partialDirichlet;
GparityWilsonImplParams() : twists(Nd, 0) {
dirichlet.resize(0);
partialDirichlet=0;
};
}; };
struct WilsonImplParams { struct WilsonImplParams {
bool overlapCommsCompute; bool overlapCommsCompute;
Coordinate dirichlet; // Blocksize of dirichlet BCs Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
AcceleratorVector<Real,Nd> twist_n_2pi_L; AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases; AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() { WilsonImplParams() {
dirichlet.resize(0); dirichlet.resize(0);
partialDirichlet=0;
boundary_phases.resize(Nd, 1.0); boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
}; };
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) { WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
partialDirichlet=0;
dirichlet.resize(0); dirichlet.resize(0);
} }
}; };
struct StaggeredImplParams { struct StaggeredImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
StaggeredImplParams() StaggeredImplParams()
{ {
partialDirichlet=0;
dirichlet.resize(0); dirichlet.resize(0);
}; };
}; };
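
A hypothetical parameter setup using the fields added above; the block sizes and the length of the dirichlet coordinate are illustrative only (here a 5d layout with the time direction blocked):

#include <Grid/Grid.h>
using namespace Grid;

WilsonImplParams makeDirichletParams(void) {
  WilsonImplParams p;                      // twists/phases default-filled
  p.dirichlet = Coordinate({0,0,0,0,16});  // Dirichlet blocksize in t only
  p.partialDirichlet = 1;                  // new flag in this diff (default 0)
  return p;
}
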

View File

@ -140,6 +140,7 @@ public:
return NMAX; return NMAX;
} }
static int getNMAX(Lattice<iImplClover<vComplexD2>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);} static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);} static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
@ -204,15 +205,18 @@ public:
typedef WilsonCloverHelpers<Impl> Helpers; typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers; typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) { static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
Clover += diag_mass; Clover += diag_mass;
} }
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, static void InvertClover(CloverField& InvClover,
CloverTriangleField& Triangle, const CloverDiagonalField& diagonal,
RealD csw_t, RealD diag_mass) { const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
// Do nothing CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
} }
// TODO: implement Cmunu for better performances with compact layout, but don't do it // TODO: implement Cmunu for better performances with compact layout, but don't do it
@ -237,9 +241,17 @@ public:
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>; template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers; typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) { // Can this be avoided?
// do nothing! static void IdentityTimesC(const CloverField& in, RealD c) {
// mass term is multiplied to exp(Clover) below int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
} }
static int getNMAX(RealD prec, RealD R) { static int getNMAX(RealD prec, RealD R) {
@ -254,175 +266,62 @@ public:
return NMAX; return NMAX;
} }
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);} static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);} static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){ static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
typedef iMatrix<ComplexD,6> mat; GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
RealD qn[6]; int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
RealD qnold[6];
RealD p[5];
RealD trA2, trA3, trA4;
mat A2, A3, A4, A5; Clover *= (1.0/diag_mass);
A2 = alpha * alpha * arg * arg;
A3 = alpha * arg * A2;
A4 = A2 * A2;
A5 = A2 * A3;
trA2 = toReal( trace(A2) ); // Taylor expansion, slow but generic
trA3 = toReal( trace(A3) ); // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
trA4 = toReal( trace(A4)); // qN = cN
// qn = cn + qn+1 X
p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
p[3] = trA3 / 3.0;
p[4] = 0.5 * trA2;
qnold[0] = cN[Niter];
qnold[1] = 0.0;
qnold[2] = 0.0;
qnold[3] = 0.0;
qnold[4] = 0.0;
qnold[5] = 0.0;
for(int i = Niter-1; i >= 0; i--)
{
qn[0] = p[0] * qnold[5] + cN[i];
qn[1] = p[1] * qnold[5] + qnold[0];
qn[2] = p[2] * qnold[5] + qnold[1];
qn[3] = p[3] * qnold[5] + qnold[2];
qn[4] = p[4] * qnold[5] + qnold[3];
qn[5] = qnold[4];
qnold[0] = qn[0];
qnold[1] = qn[1];
qnold[2] = qn[2];
qnold[3] = qn[3];
qnold[4] = qn[4];
qnold[5] = qn[5];
}
mat unit(1.0);
dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
}
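
The 6x6 routine deleted above relied on a Cayley-Hamilton reduction; sketched in formulas, assuming the block \(A\) is traceless, as holds for \(\sigma_{\mu\nu}F_{\mu\nu}\):

\[
A^{6} \;=\; p_{4}A^{4} + p_{3}A^{3} + p_{2}A^{2} + p_{1}A + p_{0}\,\mathbf{1},
\]

with the \(p_k\) built from traces of low powers of \(A\) as in the deleted lines, so each Horner step \(q(A) \mapsto q(A)\,A + c_i\,\mathbf{1}\) stays inside \(\mathrm{span}\{\mathbf{1},A,\dots,A^{5}\}\) and no power above \(A^{5}\) is ever formed.
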
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
GridBase* grid = Diagonal.Grid();
int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
//
// Implementation completely in Daniel's layout
//
// Taylor expansion with Cayley-Hamilton recursion
// underlying Horner scheme as above
std::vector<RealD> cn(NMAX+1); std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0; cn[0] = 1.0;
for (int i=1; i<=NMAX; i++){ for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i); cn[i] = cn[i-1] / RealD(i);
}
// Taken over from Daniel's implementation ExpClover = Zero();
conformable(Diagonal, Triangle); IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
long lsites = grid->lSites(); // prepare inverse
{ CloverInv = (-1.0)*Clover;
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
typedef iMatrix<ComplexD,6> mat;
autoView(diagonal_v, Diagonal, CpuRead); Clover = ExpClover * diag_mass;
autoView(triangle_v, Triangle, CpuRead);
autoView(diagonalExp_v, Diagonal, CpuWrite);
autoView(triangleExp_v, Triangle, CpuWrite);
thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
mat srcCloverOpUL(0.0); // upper left block CloverInv = ExpClover * (1.0/diag_mass);
mat srcCloverOpLR(0.0); // lower right block
mat ExpCloverOp;
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_exp_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_exp_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
int block;
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
// exp(Clover)
ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
});
}
Diagonal *= diag_mass;
Triangle *= diag_mass;
} }
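
In formulas, the replacement evaluates the same exponential for the whole field at once with a generic Taylor series in Horner form. With \(X = C/m\) the mass-normalised clover term:

\[
\exp(X) \;\approx\; E_0, \qquad E_N = c_N\,\mathbf{1}, \qquad E_k = E_{k+1}\,X + c_k\,\mathbf{1} \;\; (k=N-1,\dots,0), \qquad c_0 = 1,\; c_n = c_{n-1}/n,
\]

after which the mass factors are restored, \(C \leftarrow m\,\exp(X)\) and \(C^{-1} \leftarrow \exp(-X)/m\), the inverse coming from the same Horner loop run on \(-X\).
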
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
if (fixedBoundaries)
{
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
else
{
CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
  assert(0);


@@ -225,7 +225,7 @@ public:
 RealD csw_t;
 RealD cF;
-bool open_boundaries;
+bool fixedBoundaries;
 CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
 CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;


@@ -0,0 +1,291 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DWFSlow.h
Copyright (C) 2022
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template <class Impl>
class DWFSlowFermion : public FermionOperator<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid4; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid4; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
//////////////////////////////////////////////////////////////////
// override multiply; cut the number of routines by passing a dagger argument
// and also make the interface more uniformly consistent
//////////////////////////////////////////////////////////////////
virtual void M(const FermionField &in, FermionField &out)
{
FermionField tmp(_grid);
out = (5.0 - M5) * in;
Dhop(in,tmp,DaggerNo);
out = out + tmp;
}
virtual void Mdag(const FermionField &in, FermionField &out)
{
FermionField tmp(_grid);
out = (5.0 - M5) * in;
Dhop(in,tmp,DaggerYes);
out = out + tmp;
};
/////////////////////////////////////////////////////////
// half checkerboard operations; 5D redblack so just the site identity
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out)
{
if ( in.Checkerboard() == Odd ) {
this->DhopEO(in,out,DaggerNo);
} else {
this->DhopOE(in,out,DaggerNo);
}
}
void MeooeDag(const FermionField &in, FermionField &out)
{
if ( in.Checkerboard() == Odd ) {
this->DhopEO(in,out,DaggerYes);
} else {
this->DhopOE(in,out,DaggerYes);
}
};
// allow override for twisted mass and clover
virtual void Mooee(const FermionField &in, FermionField &out)
{
out = (5.0 - M5) * in;
}
virtual void MooeeDag(const FermionField &in, FermionField &out)
{
out = (5.0 - M5) * in;
}
virtual void MooeeInv(const FermionField &in, FermionField &out)
{
out = (1.0/(5.0 - M5)) * in;
};
virtual void MooeeInvDag(const FermionField &in, FermionField &out)
{
out = (1.0/(5.0 - M5)) * in;
};
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector<double> twist) {} ;
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { assert(0);};
void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
Dhop5(in,out,MassField,MassField,dag );
for(int mu=0;mu<4;mu++){
DhopDirU(in,Umu[mu],Umu[mu],tmp,mu,dag ); out = out + tmp;
}
};
void DhopOE(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
assert(in.Checkerboard()==Even);
Dhop5(in,out,MassFieldOdd,MassFieldEven,dag);
for(int mu=0;mu<4;mu++){
DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag ); out = out + tmp;
}
};
void DhopEO(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
assert(in.Checkerboard()==Odd);
Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag );
for(int mu=0;mu<4;mu++){
DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag ); out = out + tmp;
}
};
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);};
void MdirAll(const FermionField &in, std::vector<FermionField> &out) { assert(0);};
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);};
void DhopDirAll(const FermionField &in, std::vector<FermionField> &out) { assert(0);};
void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);};
void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag)
{
RealD sgn= 1.0;
if (dag ) sgn=-1.0;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
// mass is 1,1,1,1,-m and has to multiply the round-the-world term
FermionField tmp (in.Grid());
tmp = U5e * Cshift(in,mu+1,1);
out = tmp - Gamma(Gmu[mu])*tmp*sgn;
tmp = Cshift(adj(U5o)*in,mu+1,-1);
out = out + tmp + Gamma(Gmu[mu])*tmp*sgn;
out = -0.5*out;
};
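In equation form, one reading of DhopDirU above (with s = +1 for the undaggered operator and s = -1 for the daggered one; the Cshift acts in dimension mu+1 because dimension 0 holds the fifth direction):

\[
D_\mu\,\psi(x) \;=\; -\tfrac{1}{2}\Big[\,(1 - s\,\gamma_\mu)\,U_\mu(x)\,\psi(x+\hat\mu) \;+\; (1 + s\,\gamma_\mu)\,U_\mu^\dagger(x-\hat\mu)\,\psi(x-\hat\mu)\Big],\qquad s = \pm 1 .
\]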
void Dhop5(const FermionField &in, FermionField &out, ComplexField &massE, ComplexField &massO, int dag)
{
// Mass term.... must multiply the round-the-world term with mass = 1,1,1,1, -m
RealD sgn= 1.0;
if (dag ) sgn=-1.0;
Gamma G5(Gamma::Algebra::Gamma5);
FermionField tmp (in.Grid());
tmp = massE*Cshift(in,0,1);
out = tmp - G5*tmp*sgn;
tmp = Cshift(massO*in,0,-1);
out = out + tmp + G5*tmp*sgn;
out = -0.5*out;
};
// Constructor
DWFSlowFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass, RealD _M5)
:
_grid(&Fgrid),
_cbgrid(&Hgrid),
_grid4(_Umu.Grid()),
Umu(Nd,&Fgrid),
UmuEven(Nd,&Hgrid),
UmuOdd(Nd,&Hgrid),
MassField(&Fgrid),
MassFieldEven(&Hgrid),
MassFieldOdd(&Hgrid),
M5(_M5),
mass(_mass),
_tmp(&Hgrid)
{
Ls=Fgrid._fdimensions[0];
ImportGauge(_Umu);
typedef typename FermionField::scalar_type scalar;
Lattice<iScalar<vInteger> > coor(&Fgrid);
LatticeCoordinate(coor, 0); // Scoor
ComplexField one(&Fgrid);
MassField =scalar(-mass);
one =scalar(1.0);
MassField =where(coor==Integer(Ls-1),MassField,one);
for(int mu=0;mu<Nd;mu++){
pickCheckerboard(Even,UmuEven[mu],Umu[mu]);
pickCheckerboard(Odd ,UmuOdd[mu],Umu[mu]);
}
pickCheckerboard(Even,MassFieldEven,MassField);
pickCheckerboard(Odd ,MassFieldOdd,MassField);
}
// DoubleStore impl dependent
void ImportGauge(const GaugeField &_Umu4)
{
GaugeLinkField U4(_grid4);
for(int mu=0;mu<Nd;mu++){
U4 = PeekIndex<LorentzIndex>(_Umu4, mu);
for(int s=0;s<this->Ls;s++){
InsertSlice(U4,Umu[mu],s,0);
}
}
}
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:
virtual RealD Mass(void) { return mass; }
virtual int isTrivialEE(void) { return 1; };
RealD mass;
RealD M5;
int Ls;
GridBase *_grid4;
GridBase *_grid;
GridBase *_cbgrid4;
GridBase *_cbgrid;
// Copy of the gauge field, with even and odd subsets
std::vector<GaugeLinkField> Umu;
std::vector<GaugeLinkField> UmuEven;
std::vector<GaugeLinkField> UmuOdd;
ComplexField MassField;
ComplexField MassFieldEven;
ComplexField MassFieldOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu){}
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx){}
};
typedef DWFSlowFermion<WilsonImplF> DWFSlowFermionF;
typedef DWFSlowFermion<WilsonImplD> DWFSlowFermionD;
NAMESPACE_END(Grid);
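A hedged usage sketch for the class above (assumes the standard Grid grid-making helpers and RNG boilerplate; the lattice size, seeds, mass and M5 values are illustrative only, not from the source):

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);
  const int Ls = 8;   // assumed fifth-dimension extent
  GridCartesian *UGrid =
    SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                   GridDefaultSimd(Nd, vComplexD::Nsimd()),
                                   GridDefaultMpi());
  GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  LatticeGaugeFieldD Umu(UGrid);
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(std::vector<int>({1,2,3,4}));
  SU<Nc>::HotConfiguration(RNG4, Umu);

  DWFSlowFermionD Ddwf(Umu, *FGrid, *FrbGrid, /*mass*/0.01, /*M5*/1.8);
  LatticeFermionD src(FGrid), result(FGrid);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(std::vector<int>({5,6,7,8}));
  random(RNG5, src);
  Ddwf.M(src, result);  // apply the slow (Cshift-based) DWF operator
  Grid_finalize();
  return 0;
}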


@@ -47,6 +47,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/DWFSlow.h>       // Slow DWF
 #include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
 NAMESPACE_CHECK(Wilson);
@@ -112,28 +113,21 @@ NAMESPACE_CHECK(DWFutils);
 // Cayley 5d
 NAMESPACE_BEGIN(Grid);
-typedef WilsonFermion<WilsonImplR> WilsonFermionR;
+typedef WilsonFermion<WilsonImplD2> WilsonFermionD2;
 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 typedef WilsonFermion<WilsonImplD> WilsonFermionD;
-//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
-//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
-//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
-typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
 typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
 typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
-typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
-typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
 typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
 // Twisted mass fermion
-typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
+typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2;
 typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
 typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
@@ -141,23 +135,20 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
 template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
-typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
+typedef WilsonClover<WilsonImplD2> WilsonCloverFermionD2;
 typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
 typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
-typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
+typedef WilsonExpClover<WilsonImplD2> WilsonExpCloverFermionD2;
 typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
 typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
-typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
 typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
 typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
-typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
-typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
@@ -165,161 +156,108 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
 template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
 template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
-typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
+typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
 typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
 typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
-typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
+typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
 typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
 typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
-typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
 typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
 typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
-typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
-typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
 typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
 // Domain Wall fermions
-typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
+typedef DomainWallFermion<WilsonImplD2> DomainWallFermionD2;
-//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
+typedef DomainWallEOFAFermion<WilsonImplD2> DomainWallEOFAFermionD2;
-//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
-//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
-typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
-//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
+typedef MobiusFermion<WilsonImplD2> MobiusFermionD2;
-//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
-//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
-typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
-//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
+typedef MobiusEOFAFermion<WilsonImplD2> MobiusEOFAFermionD2;
-//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
-//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
-typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
 typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
 typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
-//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
+typedef ZMobiusFermion<ZWilsonImplD2> ZMobiusFermionD2;
-//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
-//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
-typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
-//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
+typedef ScaledShamirFermion<WilsonImplD2> ScaledShamirFermionD2;
-//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
-//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
-// Ls vectorised
-typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
-typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
+typedef MobiusZolotarevFermion<WilsonImplD2> MobiusZolotarevFermionD2;
 typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
 typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
-typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
+typedef ShamirZolotarevFermion<WilsonImplD2> ShamirZolotarevFermionD2;
 typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
 typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
-typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
+typedef OverlapWilsonCayleyTanhFermion<WilsonImplD2> OverlapWilsonCayleyTanhFermionD2;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
 typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
-typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
+typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD2> OverlapWilsonCayleyZolotarevFermionD2;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
 typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
 // Continued fraction
-typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
+typedef OverlapWilsonContFracTanhFermion<WilsonImplD2> OverlapWilsonContFracTanhFermionD2;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
 typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
-typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
+typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD2> OverlapWilsonContFracZolotarevFermionD2;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
 typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
 // Partial fraction
-typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
+typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD2> OverlapWilsonPartialFractionTanhFermionD2;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
 typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
-typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
+typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD2> OverlapWilsonPartialFractionZolotarevFermionD2;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
 typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
 // Gparity cases; partial list until tested
-typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
 typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
 typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;
-//typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
-//typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
-//typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;
-typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
-//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
+typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionD2;
-//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
-//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
-typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
-//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
+typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionD2;
-//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
-//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
-typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
 typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
 typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
-//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
+typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionD2;
-//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
-//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
-typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
 typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
 typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
-//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
+typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionD2;
-//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
-//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
-typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
-//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
-//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
-//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
-typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
-typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
 typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
 typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
-typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;


@@ -32,17 +32,218 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
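Worked example of the factor above (values assumed for illustration, not from the source): with a fifth-dimension extent Ls = 16 and dwf_compressor_depth = 2, only the outermost two s-slices at each end are communicated, so PartialCompressionFactor = 16/(2*2) = 4 and the face buffers shrink fourfold.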
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF only hack: if a direction is OFF node we use Partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
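A standalone sketch (not Grid code; Ls = 8 and depth = 2 are assumed values) of the DecompressFace index map above: the received buffer carries only 2*depth slices per 4d site, and the kernel buffer is rebuilt with the interior slices zero-filled, which is what implements the partial Dirichlet cut.

#include <cstdio>
int main() {
  const int Ls = 8, depth = 2;   // assumed values for illustration
  for (int s = 0; s < Ls; s++) {
    if (s < depth)
      std::printf("kernel slice s=%d <- comm slice %d\n", s, s);
    else if (s >= Ls - depth)
      std::printf("kernel slice s=%d <- comm slice %d\n", s, depth + s - (Ls - depth));
    else
      std::printf("kernel slice s=%d <- Zero() (partial Dirichlet fill)\n", s);
  }
  return 0;
}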
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in simd directions
// Do the gather as we need to treat SIMD lanes differently, and insert zeroes on the receive side
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
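A standalone sketch (not Grid code; Ls = 8 and depth = 2 are assumed values) of the fifth-dimension reorder in Gather_plane_exchange above: slice s lands in slot (s+depth)%Ls, so the last depth slices come first, matching the comment above and the MergeFace/DecompressFace layouts.

#include <cstdio>
int main() {
  const int Ls = 8, depth = 2;   // assumed values for illustration
  for (int s = 0; s < Ls; s++)
    std::printf("s=%d -> slot %d\n", s, (s + depth) % Ls);
  return 0;
}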
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
 /////////////////////////////////////////////////////////////////////////////////////////////
-// optimised versions supporting half precision too
+// optimised versions supporting half precision too??? Deprecate
 /////////////////////////////////////////////////////////////////////////////////////////////
-template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
-class WilsonCompressorTemplate;
+//Could make FaceGather a template param, but then behaviour is runtime not compile time
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
-class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
-                                typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
+class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
+// : public FaceGatherSimple
 {
 public:
@@ -79,172 +280,81 @@ public:
 /*****************************************************/
 /* Exchange includes precision change if mpi data is not same */
 /*****************************************************/
-accelerator_inline void Exchange(SiteHalfSpinor *mp,
-                                 const SiteHalfSpinor * __restrict__ vp0,
-                                 const SiteHalfSpinor * __restrict__ vp1,
-                                 Integer type,Integer o) const {
+accelerator_inline void Exchange(SiteHalfSpinor &mp0,
+                                 SiteHalfSpinor &mp1,
+                                 const SiteHalfSpinor & vp0,
+                                 const SiteHalfSpinor & vp1,
+                                 Integer type) const {
 #ifdef GRID_SIMT
-  exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+  exchangeSIMT(mp0,mp1,vp0,vp1,type);
 #else
   SiteHalfSpinor tmp1;
   SiteHalfSpinor tmp2;
-  exchange(tmp1,tmp2,vp0[o],vp1[o],type);
-  vstream(mp[2*o ],tmp1);
-  vstream(mp[2*o+1],tmp2);
+  exchange(tmp1,tmp2,vp0,vp1,type);
+  vstream(mp0,tmp1);
+  vstream(mp1,tmp2);
 #endif
 }
 /*****************************************************/
 /* Have a decompression step if mpi data is not same */
 /*****************************************************/
-accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
-                                   SiteHalfSpinor * __restrict__ in, Integer o) const {
-  assert(0);
+accelerator_inline void Decompress(SiteHalfSpinor &out,
+                                   SiteHalfSpinor &in) const {
+  out = in;
 }
 /*****************************************************/
 /* Compress Exchange */
 /*****************************************************/
-accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
-                                         SiteHalfSpinor * __restrict__ out1,
-                                         const SiteSpinor * __restrict__ in,
-                                         Integer j,Integer k, Integer m,Integer type) const
+accelerator_inline void CompressExchange(SiteHalfSpinor &out0,
+                                         SiteHalfSpinor &out1,
+                                         const SiteSpinor &in0,
+                                         const SiteSpinor &in1,
+                                         Integer type) const
 {
 #ifdef GRID_SIMT
   typedef SiteSpinor vobj;
   typedef SiteHalfSpinor hvobj;
-  typedef decltype(coalescedRead(*in)) sobj;
-  typedef decltype(coalescedRead(*out0)) hsobj;
+  typedef decltype(coalescedRead(in0)) sobj;
+  typedef decltype(coalescedRead(out0)) hsobj;
-  unsigned int Nsimd = vobj::Nsimd();
+  constexpr unsigned int Nsimd = vobj::Nsimd();
   unsigned int mask = Nsimd >> (type + 1);
   int lane = acceleratorSIMTlane(Nsimd);
   int j0 = lane &(~mask); // inner coor zero
   int j1 = lane |(mask) ; // inner coor one
-  const vobj *vp0 = &in[k];
-  const vobj *vp1 = &in[m];
+  const vobj *vp0 = &in0;
+  const vobj *vp1 = &in1;
   const vobj *vp = (lane&mask) ? vp1:vp0;
   auto sa = coalescedRead(*vp,j0);
   auto sb = coalescedRead(*vp,j1);
   hsobj psa, psb;
   projector::Proj(psa,sa,mu,dag);
   projector::Proj(psb,sb,mu,dag);
-  coalescedWrite(out0[j],psa);
-  coalescedWrite(out1[j],psb);
+  coalescedWrite(out0,psa);
+  coalescedWrite(out1,psb);
 #else
   SiteHalfSpinor temp1, temp2;
   SiteHalfSpinor temp3, temp4;
-  projector::Proj(temp1,in[k],mu,dag);
-  projector::Proj(temp2,in[m],mu,dag);
+  projector::Proj(temp1,in0,mu,dag);
+  projector::Proj(temp2,in1,mu,dag);
   exchange(temp3,temp4,temp1,temp2,type);
-  vstream(out0[j],temp3);
-  vstream(out1[j],temp4);
+  vstream(out0,temp3);
+  vstream(out1,temp4);
 #endif
 }
 /*****************************************************/
 /* Pass the info to the stencil */
 /*****************************************************/
-accelerator_inline bool DecompressionStep(void) const { return false; }
+accelerator_inline bool DecompressionStep(void) const {
+  return false;
+}
 };
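A standalone sketch (not Grid code; Nsimd = 8 is an assumed SIMD width) of the lane pairing in CompressExchange above: mask = Nsimd >> (type+1) selects which inner coordinate is being exchanged, each lane reads the pair (lane & ~mask, lane | mask), and lane & mask chooses whether in0 or in1 supplies the data.

#include <cstdio>
int main() {
  const unsigned Nsimd = 8;      // assumed SIMD width for illustration
  for (unsigned type = 0; type < 3; type++) {
    unsigned mask = Nsimd >> (type + 1);
    std::printf("type %u (mask %u):\n", type, mask);
    for (unsigned lane = 0; lane < Nsimd; lane++)
      std::printf("  lane %u: j0=%u j1=%u source=%s\n",
                  lane, lane & ~mask, lane | mask,
                  (lane & mask) ? "in1" : "in0");
  }
  return 0;
}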
#if 0
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
{
public:
int mu,dag;
void Point(int p) { mu=p; };
WilsonCompressorTemplate(int _dag=0){
dag = _dag;
}
typedef _Spinor SiteSpinor;
typedef _Hspinor SiteHalfSpinor;
typedef _HCspinor SiteHalfCommSpinor;
typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) const {
return sizeof(SiteHalfCommSpinor);
}
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
SiteHalfSpinor hsp;
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
projector::Proj(hsp,in,mu,dag);
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
}
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
#ifdef GRID_SIMT
typedef decltype(coalescedRead(buf)) sobj;
sobj sp;
auto sin = coalescedRead(in);
projector::Proj(sp,sin,mu,dag);
coalescedWrite(buf,sp);
#else
projector::Proj(buf,in,mu,dag);
#endif
}
/*****************************************************/
/* Exchange includes precision change if mpi data is not same */
/*****************************************************/
accelerator_inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1,
Integer type,Integer o) const {
SiteHalfSpinor vt0,vt1;
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw);
precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw);
exchange(mp[2*o],mp[2*o+1],vt0,vt1,type);
}
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
}
/*****************************************************/
/* Compress Exchange */
/*****************************************************/
accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1,
const SiteSpinor *in,
Integer j,Integer k, Integer m,Integer type) const {
SiteHalfSpinor temp1, temp2,temp3,temp4;
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in[m],mu,dag);
exchange(temp3,temp4,temp1,temp2,type);
precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw);
precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw);
}
/*****************************************************/
/* Pass the info to the stencil */
/*****************************************************/
accelerator_inline bool DecompressionStep(void) const { return true; }
};
#endif
 #define DECLARE_PROJ(Projector,Compressor,spProj) \
 class Projector { \
 public: \
@@ -374,24 +484,26 @@ public:
 int dag = compress.dag;
 int face_idx=0;
+#define vet_same_node(a,b) \
+  { auto tmp = b; }
 if ( dag ) {
-  assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
-  assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
-  assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
-  assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
-  assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
-  assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
-  assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
-  assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
+  vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
+  vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
+  vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
+  vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
+  vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
+  vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
+  vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
+  vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
 } else {
-  assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
-  assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
-  assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
-  assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
-  assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
-  assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
-  assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
-  assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
+  vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
+  vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
+  vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
+  vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
+  vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
+  vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
+  vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
+  vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
 }
 this->face_table_computed=1;
 assert(this->u_comm_offset==this->_unified_buffer_size);


@@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid);
 template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
 class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
 public:
 static const int Dimension = Representation::Dimension;
 static const bool isFundamental = Representation::isFundamental;
 static const bool LsVectorised=false;
@@ -242,19 +242,13 @@ public:
 typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > WilsonImplR; // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double
+typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > WilsonImplD2; // Double
-//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL; // Real.. whichever prec
-//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH; // Float
-//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF; // Double
 typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
 typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
 typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
+typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffComplex > ZWilsonImplD2; // Double
-//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
-//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
-//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
 typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec
 typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float
 typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD; // Double


@@ -52,13 +52,6 @@ public:
 typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
 public:
-#ifdef GRID_SYCL
-#define SYCL_HACK
-#endif
-#ifdef SYCL_HACK
-static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
-                             int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
-#endif
 static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
                        int Ls, int Nsite, const FermionField &in, FermionField &out,


@@ -905,88 +905,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
#undef TopRowWithSource
#if 0
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> > & Matp,
Vector<iSinglet<Simd> > & Matm)
{
int Ls=this->Ls;
GridBase *grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if ( LLs == Ls ) {
return; // Not vectorised in 5th direction
}
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = bee[s];
Pminus(s,s)= bee[s];
}
for(int s=0;s<Ls-1;s++){
Pminus(s,s+1) = -cee[s];
}
for(int s=0;s<Ls-1;s++){
Pplus(s+1,s) = -cee[s+1];
}
Pplus (0,Ls-1) = mass*cee[0];
Pminus(Ls-1,0) = mass*cee[Ls-1];
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
if ( inv ) {
PplusMat =Pplus.inverse();
PminusMat=Pminus.inverse();
} else {
PplusMat =Pplus;
PminusMat=Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd=Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0;s2<Ls;s2++){
for(int s1=0;s1<LLs;s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type *)&Vp;
scalar_type *sm = (scalar_type *)&Vm;
for(int l=0;l<Nsimd;l++){
if ( switcheroo<Coeff_t>::iscomplex() ) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
#endif
NAMESPACE_END(Grid);


@@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
   , csw_r(_csw_r)
   , csw_t(_csw_t)
   , cF(_cF)
-  , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
+  , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
   , Diagonal(&Fgrid), Triangle(&Fgrid)
   , DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
   , DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
@@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
   csw_r /= clover_anisotropy.xi_0;
   ImportGauge(_Umu);
-  if (open_boundaries) {
+  if (fixedBoundaries) {
     this->BoundaryMaskEven.Checkerboard() = Even;
     this->BoundaryMaskOdd.Checkerboard() = Odd;
     CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
@@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
   WilsonBase::Dhop(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
   WilsonBase::DhopOE(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
   WilsonBase::DhopEO(in, out, dag);
-  if(open_boundaries) ApplyBoundaryMask(out);
+  if(fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
   WilsonBase::DhopDir(in, out, dir, disp);
-  if(this->open_boundaries) ApplyBoundaryMask(out);
+  if(this->fixedBoundaries) ApplyBoundaryMask(out);
 }
 template<class Impl, class CloverHelpers>
 void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
   WilsonBase::DhopDirAll(in, out);
-  if(this->open_boundaries) {
+  if(this->fixedBoundaries) {
     for(auto& o : out) ApplyBoundaryMask(o);
   }
 }
@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp); Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp); axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out); if(fixedBoundaries) ApplyBoundaryMask(out);
} }
template<class Impl, class CloverHelpers> template<class Impl, class CloverHelpers>
@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp); MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp); axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out); if(fixedBoundaries) ApplyBoundaryMask(out);
} }
template<class Impl, class CloverHelpers> template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) { void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out); WilsonBase::Meooe(in, out);
if(open_boundaries) ApplyBoundaryMask(out); if(fixedBoundaries) ApplyBoundaryMask(out);
} }
template<class Impl, class CloverHelpers> template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) { void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out); WilsonBase::MeooeDag(in, out);
if(open_boundaries) ApplyBoundaryMask(out); if(fixedBoundaries) ApplyBoundaryMask(out);
} }
template<class Impl, class CloverHelpers> template<class Impl, class CloverHelpers>
@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
} else { } else {
MooeeInternal(in, out, Diagonal, Triangle); MooeeInternal(in, out, Diagonal, Triangle);
} }
if(open_boundaries) ApplyBoundaryMask(out); if(fixedBoundaries) ApplyBoundaryMask(out);
} }
template<class Impl, class CloverHelpers> template<class Impl, class CloverHelpers>
@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
} else { } else {
MooeeInternal(in, out, DiagonalInv, TriangleInv); MooeeInternal(in, out, DiagonalInv, TriangleInv);
} }
if(open_boundaries) ApplyBoundaryMask(out); if(fixedBoundaries) ApplyBoundaryMask(out);
} }
template<class Impl, class CloverHelpers> template<class Impl, class CloverHelpers>
@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField
template<class Impl, class CloverHelpers> template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) { void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!open_boundaries); // TODO check for changes required for open bc assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term // NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid()); conformable(X.Grid(), Y.Grid());
@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
GridBase* grid = _Umu.Grid(); GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid); CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu // Compute the field strength terms mu>nu
double t2 = usecond(); double t2 = usecond();
@ -324,24 +325,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t; TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t; TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Handle mass term based on clover policy
CloverHelpers::MassTerm(TmpOriginal, this->diag_mass); // Instantiate the clover term
// - In case of the standard clover the mass term is added
// Convert the data layout of the clover term // - In case of the exponential clover the clover term is exponentiated
double t4 = usecond(); double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Exponentiate the clover (nothing happens in case of the standard clover) // Modify the clover term at the temporal boundaries in case of open boundary conditions
double t5 = usecond();
CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
// Possible modify the boundary values
double t6 = usecond(); double t6 = usecond();
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions) // Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
// TODO: For now this inversion is explictly done on the CPU
double t7 = usecond(); double t7 = usecond();
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields // Fill the remaining clover fields
double t8 = usecond(); double t8 = usecond();
@ -362,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl; std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl; std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl; std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl; std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl; std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl; std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl; std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl; std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl; std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
} }
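Note on the instantiate/invert split above: the comments describe a policy dispatch where the standard clover adds the mass term and the term must later be inverted explicitly, while the exponential clover yields exp(-Clover) as a by-product that serves directly as the inverse under (anti-)periodic boundaries. A scalar caricature of that dispatch, with hypothetical policy names and a toy C -> m*exp(C/m) form (none of this is the Grid CloverHelpers API):

    #include <cmath>
    #include <iostream>

    // Scalar caricature of the two clover policies; hypothetical names.
    struct StandardCloverPolicy {
      // Standard clover: add the mass term; the inverse is computed explicitly.
      static double Instantiate(double clover, double mass) { return clover + mass; }
      static double Invert(double instantiated) { return 1.0 / instantiated; }
    };

    struct ExponentialCloverPolicy {
      // Exponential clover (toy form): C -> m*exp(C/m); the exponential of the
      // negated argument is then available essentially for free as the inverse.
      static double Instantiate(double clover, double mass) { return mass * std::exp(clover / mass); }
      static double Invert(double instantiated) { return 1.0 / instantiated; }
    };

    template <class Policy>
    void demo(double clover, double mass) {
      double t  = Policy::Instantiate(clover, mass);
      double ti = Policy::Invert(t);
      std::cout << "term = " << t << "  inverse = " << ti << "  product = " << t * ti << "\n";
    }

    int main() {
      demo<StandardCloverPolicy>(0.3, 1.1);
      demo<ExponentialCloverPolicy>(0.3, 1.1);
      return 0;
    }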

View File

@@ -63,6 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   _tmp(&FiveDimRedBlackGrid),
   Dirichlet(0)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
   // some assertions
   assert(FiveDimGrid._ndimension==5);
   assert(FourDimGrid._ndimension==4);
@@ -96,6 +100,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
     Coordinate block = p.dirichlet;
     if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
       Dirichlet = 1;
+      std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
+      std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
       Block = block;
     }
   } else {
@@ -137,9 +143,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   StencilEven.BuildSurfaceList(LLs,vol4);
   StencilOdd.BuildSurfaceList(LLs,vol4);
-  // std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-  //           <<" " << StencilEven.surface_list.size()<<std::endl;
 }
 
 template<class Impl>
@@ -148,12 +151,29 @@ void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
   GaugeField HUmu(_Umu.Grid());
   HUmu = _Umu*(-0.5);
   if ( Dirichlet ) {
-    std::cout << GridLogDslash << " Dirichlet BCs 5d " <<Block<<std::endl;
-    Coordinate GaugeBlock(Nd);
-    for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
-    std::cout << GridLogDslash << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
-    DirichletFilter<GaugeField> Filter(GaugeBlock);
-    Filter.applyFilter(HUmu);
+    if ( this->Params.partialDirichlet ) {
+      std::cout << GridLogMessage << " partialDirichlet BCs " <<Block<<std::endl;
+    } else {
+      std::cout << GridLogMessage << " FULL Dirichlet BCs " <<Block<<std::endl;
+    }
+
+    std::cout << GridLogMessage << "Checking block size multiple of rank boundaries for Dirichlet"<<std::endl;
+    for(int d=0;d<Nd;d++) {
+      int GaugeBlock = Block[d+1];
+      int ldim = GaugeGrid()->LocalDimensions()[d];
+      if (GaugeBlock) assert( (GaugeBlock%ldim)==0);
+    }
+
+    if (!this->Params.partialDirichlet) {
+      std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " <<Block<<std::endl;
+      Coordinate GaugeBlock(Nd);
+      for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
+      DirichletFilter<GaugeField> Filter(GaugeBlock);
+      Filter.applyFilter(HUmu);
+    } else {
+      std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " NOT filtered gauge field" <<std::endl;
+    }
   }
   Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
   pickCheckerboard(Even,UmuEven,Umu);
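Note on the new assertion above: each non-zero Dirichlet block extent must be an exact multiple of the local per-rank lattice extent in that direction, so the Dirichlet walls coincide with rank boundaries rather than cutting through a rank's interior. A stand-alone sketch of the same commensurability check in plain C++ (example values, not Grid's Coordinate API):

    #include <cassert>
    #include <iostream>
    #include <vector>

    int main() {
      // 4d Dirichlet block and per-rank local dimensions; example values.
      std::vector<int> Block = {16, 16, 16, 16};
      std::vector<int> ldims = { 8,  8, 16, 16};   // local extent on this rank

      for (size_t d = 0; d < Block.size(); d++) {
        if (Block[d]) {
          // Block must be commensurate with the rank decomposition.
          assert(Block[d] % ldims[d] == 0);
          std::cout << "dim " << d << ": block " << Block[d]
                    << " spans " << Block[d] / ldims[d] << " rank(s)\n";
        }
      }
      return 0;
    }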

View File

@@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
   _tmp(&Hgrid),
   anisotropyCoeff(anis)
 {
+  Stencil.lo     = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo  = &LebesgueEvenOdd;
   // Allocate the required comms buffer
   ImportGauge(_Umu);
   if (anisotropyCoeff.isAnisotropic){

View File

@@ -433,11 +433,23 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
   });
 
 #define ASM_CALL(A) \
-  thread_for( ss, Nsite, { \
+  thread_for( sss, Nsite, { \
+    int ss = st.lo->Reorder(sss); \
     int sU = ss; \
     int sF = ss*Ls; \
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
   });
+
+#define ASM_CALL_SLICE(A) \
+  auto grid = in.Grid() ; \
+  int nt = grid->LocalDimensions()[4]; \
+  int nxyz = Nsite/nt ; \
+  for(int t=0;t<nt;t++){ \
+    thread_for( sss, nxyz, { \
+      int ss = t*nxyz+sss; \
+      int sU = ss; \
+      int sF = ss*Ls; \
+      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
+    });}
 
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -451,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifdef SYCL_HACK
-    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
-#else
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
-#endif
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) { ASM_CALL(AsmDhopSite); return;}
 #endif
@@ -466,6 +474,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) { ASM_CALL(AsmDhopSiteInt); return;}
 #endif
   } else if( exterior ) {
+    acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
 #ifndef GRID_CUDA
@@ -491,12 +500,13 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) { ASM_CALL(AsmDhopSiteDag); return;}
 #endif
   } else if( interior ) {
-    if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
+    if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) { ASM_CALL(AsmDhopSiteDagInt); return;}
 #endif
   } else if( exterior ) {
+    acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
 #ifndef GRID_CUDA
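Note on the kernel changes above: the interior dagger kernels are now launched non-blocking (KERNEL_CALLNB), so acceleratorFenceComputeStream() is inserted to order them against the exterior kernels that consume the freshly received halo, and ASM_CALL_SLICE additionally blocks the site loop by local t-slice. A plain-C++ sketch of that slice blocking, with a trivial serial stand-in for thread_for (illustrative only, not Grid's threading API):

    #include <iostream>
    #include <vector>

    // Trivial serial stand-in for Grid's thread_for; illustration only.
    template <class Body>
    void thread_for(int n, Body body) { for (int i = 0; i < n; i++) body(i); }

    int main() {
      const int Ls = 8, nt = 4, nxyz = 6;   // example local dimensions
      const int Nsite = nt * nxyz;
      std::vector<int> visited(Nsite, 0);

      // Slice-blocked traversal: one parallel sweep per t-slice, as in ASM_CALL_SLICE.
      for (int t = 0; t < nt; t++) {
        thread_for(nxyz, [&](int sss) {
          int ss = t * nxyz + sss;          // 4d site index within this slice
          int sU = ss;                      // gauge field site
          int sF = ss * Ls;                 // first 5d fermion site for this 4d site
          visited[sU]++;
          (void)sF;                         // kernel call would go here
        });
      }

      for (int s = 0; s < Nsite; s++)
        if (visited[s] != 1) { std::cout << "missed site " << s << "\n"; return 1; }
      std::cout << "all " << Nsite << " sites visited exactly once\n";
      return 0;
    }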

View File

@@ -9,6 +9,7 @@ STAG5_IMPL_LIST=""
 WILSON_IMPL_LIST=" \
            WilsonImplF \
            WilsonImplD \
+           WilsonImplD2 \
            WilsonAdjImplF \
            WilsonAdjImplD \
            WilsonTwoIndexSymmetricImplF \
@@ -25,8 +26,9 @@ COMPACT_WILSON_IMPL_LIST=" \
 DWF_IMPL_LIST=" \
            WilsonImplF \
            WilsonImplD \
+           WilsonImplD2 \
            ZWilsonImplF \
-           ZWilsonImplD "
+           ZWilsonImplD2 "
 
 GDWF_IMPL_LIST=" \
            GparityWilsonImplF \

View File

@@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid);
   typedef Lattice<SiteLink>  LinkField;
   typedef Lattice<SiteField> Field;
-  typedef Field              ComplexField;
+  typedef LinkField          ComplexField;
 };
 
 typedef QedGImpl<vComplex> QedGImplR;

View File

@@ -59,7 +59,7 @@ NAMESPACE_BEGIN(Grid);
     typedef RationalActionParams Params;
     Params param;
+    RealD  RefreshAction;
     //For action evaluation
     MultiShiftFunction ApproxPowerAction   ; //rational approx for X^{1/inv_pow}
     MultiShiftFunction ApproxNegPowerAction; //rational approx for X^{-1/inv_pow}
@@ -115,6 +115,56 @@ NAMESPACE_BEGIN(Grid);
   public:
 
+    // allow non-uniform tolerances
+    void SetTolerances(std::vector<RealD> action_tolerance,std::vector<RealD> md_tolerance)
+    {
+      assert(action_tolerance.size()==ApproxPowerAction.tolerances.size());
+      assert(    md_tolerance.size()==ApproxPowerMD.tolerances.size());
+
+      // Fix up the tolerances
+      for(int i=0;i<ApproxPowerAction.tolerances.size();i++){
+        ApproxPowerAction.tolerances[i]        = action_tolerance[i];
+        ApproxNegPowerAction.tolerances[i]     = action_tolerance[i];
+        ApproxHalfPowerAction.tolerances[i]    = action_tolerance[i];
+        ApproxNegHalfPowerAction.tolerances[i] = action_tolerance[i];
+      }
+      for(int i=0;i<ApproxPowerMD.tolerances.size();i++){
+        ApproxPowerMD.tolerances[i]        = md_tolerance[i];
+        ApproxNegPowerMD.tolerances[i]     = md_tolerance[i];
+        ApproxHalfPowerMD.tolerances[i]    = md_tolerance[i];
+        ApproxNegHalfPowerMD.tolerances[i] = md_tolerance[i];
+      }
+
+      // Print out - could deprecate
+      for(int i=0;i<ApproxPowerMD.tolerances.size();i++) {
+        std::cout<<GridLogMessage << " ApproxPowerMD shift["<<i<<"] "
+                 <<" pole    "<<ApproxPowerMD.poles[i]
+                 <<" residue "<<ApproxPowerMD.residues[i]
+                 <<" tol     "<<ApproxPowerMD.tolerances[i]<<std::endl;
+      }
+      /*
+      for(int i=0;i<ApproxNegPowerMD.tolerances.size();i++) {
+        std::cout<<GridLogMessage << " ApproxNegPowerMD shift["<<i<<"] "
+                 <<" pole    "<<ApproxNegPowerMD.poles[i]
+                 <<" residue "<<ApproxNegPowerMD.residues[i]
+                 <<" tol     "<<ApproxNegPowerMD.tolerances[i]<<std::endl;
+      }
+      for(int i=0;i<ApproxHalfPowerMD.tolerances.size();i++) {
+        std::cout<<GridLogMessage << " ApproxHalfPowerMD shift["<<i<<"] "
+                 <<" pole    "<<ApproxHalfPowerMD.poles[i]
+                 <<" residue "<<ApproxHalfPowerMD.residues[i]
+                 <<" tol     "<<ApproxHalfPowerMD.tolerances[i]<<std::endl;
+      }
+      for(int i=0;i<ApproxNegHalfPowerMD.tolerances.size();i++) {
+        std::cout<<GridLogMessage << " ApproxNegHalfPowerMD shift["<<i<<"] "
+                 <<" pole    "<<ApproxNegHalfPowerMD.poles[i]
+                 <<" residue "<<ApproxNegHalfPowerMD.residues[i]
+                 <<" tol     "<<ApproxNegHalfPowerMD.tolerances[i]<<std::endl;
+      }
+      */
+    }
+
     GeneralEvenOddRatioRationalPseudoFermionAction(FermionOperator<Impl>  &_NumOp,
                                                    FermionOperator<Impl>  &_DenOp,
                                                    const Params & p
@@ -149,6 +199,11 @@ NAMESPACE_BEGIN(Grid);
         ApproxNegHalfPowerMD.tolerances[i] = ApproxHalfPowerMD.tolerances[i] = param.md_tolerance;
       }
 
+      std::vector<RealD> action_tolerance(ApproxHalfPowerAction.tolerances.size(),param.action_tolerance);
+      std::vector<RealD> md_tolerance    (ApproxHalfPowerMD.tolerances.size(),param.md_tolerance);
+
+      SetTolerances(action_tolerance, md_tolerance);
+
       std::cout<<GridLogMessage << action_name() << " initialize: complete" << std::endl;
     };
@@ -217,12 +272,19 @@ NAMESPACE_BEGIN(Grid);
       assert(NumOp.ConstEE() == 1);
       assert(DenOp.ConstEE() == 1);
       PhiEven = Zero();
+      std::cout<<GridLogMessage << action_name() << " refresh: starting" << std::endl;
+      RefreshAction = norm2( etaOdd );
+      std::cout<<GridLogMessage << action_name() << " refresh: action is " << RefreshAction << std::endl;
     };
 
     //////////////////////////////////////////////////////
     // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
     //////////////////////////////////////////////////////
+    virtual RealD Sinitial(const GaugeField &U) {
+      std::cout << GridLogMessage << "Returning stored two flavour refresh action "<<RefreshAction<<std::endl;
+      return RefreshAction;
+    }
+
    virtual RealD S(const GaugeField &U) {
      std::cout<<GridLogMessage << action_name() << " compute action: starting" << std::endl;
      ImportGauge(U);
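Note on SetTolerances above: it allows a different stopping target per shift, since the large shifts of a multishift solve converge quickly and tolerate looser targets. A self-contained usage sketch with a mock class standing in for the real action (MockRationalAction, Nshift and the tolerance values are invented for illustration):

    #include <iostream>
    #include <vector>

    typedef double RealD;

    // Mock with the same shape as the action's SetTolerances; hypothetical stand-in.
    struct MockRationalAction {
      std::vector<RealD> action_tolerances, md_tolerances;
      void SetTolerances(std::vector<RealD> a, std::vector<RealD> m) {
        action_tolerances = a; md_tolerances = m;
      }
    };

    int main() {
      const int Nshift = 12;                            // number of poles in the approximation
      std::vector<RealD> action_tol(Nshift, 1.0e-10);   // accurate action evaluation
      std::vector<RealD> md_tol(Nshift);
      for (int i = 0; i < Nshift; i++)
        md_tol[i] = (i < 2) ? 1.0e-8 : 1.0e-6;          // smallest shifts tightest, rest looser

      MockRationalAction action;
      action.SetTolerances(action_tol, md_tol);
      std::cout << "md tolerance of shift 0: " << action.md_tolerances[0] << "\n";
      return 0;
    }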

View File

@@ -29,6 +29,8 @@
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h>
+
 NAMESPACE_BEGIN(Grid);
 
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -44,7 +46,7 @@ NAMESPACE_BEGIN(Grid);
     FermionOperator<ImplD> & NumOpD;
     FermionOperator<ImplD> & DenOpD;
 
     FermionOperator<ImplF> & NumOpF;
     FermionOperator<ImplF> & DenOpF;
@@ -53,29 +55,49 @@ NAMESPACE_BEGIN(Grid);
     //Allow derived classes to override the multishift CG
     virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
+#if 0
+      SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
+      ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
+      msCG(schurOp,in, out);
+#else
       SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
       SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+      FermionFieldD inD(NumOpD.FermionRedBlackGrid());
+      FermionFieldD outD(NumOpD.FermionRedBlackGrid());
+
+      // Action better with higher precision?
       ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
       msCG(schurOpD, in, out);
+#endif
     }
     virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
       SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
-      SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+      SchurDifferentiableOperator<ImplF> schurOpF (numerator ? NumOpF : DenOpF);
+      FermionFieldD inD(NumOpD.FermionRedBlackGrid());
+      FermionFieldD outD(NumOpD.FermionRedBlackGrid());
+      std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid());
-      ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+      ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
       msCG(schurOpD, in, out_elems, out);
     }
     //Allow derived classes to override the gauge import
     virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
       typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
+      typename ImplD::GaugeField Ud2(NumOpD.GaugeGrid());
       precisionChange(Uf, Ud);
+      precisionChange(Ud2, Ud);
+
+      std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
+
       NumOpD.ImportGauge(Ud);
       DenOpD.ImportGauge(Ud);
       NumOpF.ImportGauge(Uf);
       DenOpF.ImportGauge(Uf);
+      NumOpD.ImportGauge(Ud2);
+      DenOpD.ImportGauge(Ud2);
     }
 
   public:
@@ -83,7 +105,10 @@ NAMESPACE_BEGIN(Grid);
                                     FermionOperator<ImplF>  &_NumOpF, FermionOperator<ImplF>  &_DenOpF,
                                     const RationalActionParams & p, Integer _ReliableUpdateFreq
                                     ) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
-                                        ReliableUpdateFreq(_ReliableUpdateFreq), NumOpD(_NumOpD), DenOpD(_DenOpD), NumOpF(_NumOpF), DenOpF(_DenOpF){}
+                                        ReliableUpdateFreq(_ReliableUpdateFreq),
+                                        NumOpD(_NumOpD), DenOpD(_DenOpD),
+                                        NumOpF(_NumOpF), DenOpF(_DenOpF)
+    {}
 
     virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
 };
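Note on the mixed-precision multishift above: the Krylov work runs through the single-precision operator (schurOpF) while a reliable update refreshes the true residual with the double-precision operator (schurOpD) every ReliableUpdateFreq iterations; the Cleanup variant finishes the per-pole solutions the same way. A scalar caricature of the underlying float-inner/double-outer refinement idea (not the Grid solver API):

    #include <iostream>

    int main() {
      // Solve a*x = b by "inner float solve + outer double residual correction".
      const double a = 3.1415926535897931, b = 2.7182818284590452;
      double x = 0.0;

      for (int outer = 0; outer < 4; outer++) {
        double r = b - a * x;            // true residual in double precision
        float  af = (float)a, rf = (float)r;
        float  ef = rf / af;             // "inner solve" carried out in float
        x += (double)ef;                 // reliable update of the double solution
        std::cout << "outer " << outer << " residual " << b - a * x << "\n";
      }
      return 0;
    }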

View File

@@ -68,7 +68,8 @@ NAMESPACE_BEGIN(Grid);
 };
 
 template<class Impl,class ImplF>
-class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
+class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction
+  : public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
 public:
   typedef OneFlavourRationalParams Params;
 private:

View File

@@ -38,7 +38,7 @@ NAMESPACE_BEGIN(Grid);
 class TwoFlavourEvenOddRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
 public:
   INHERIT_IMPL_TYPES(Impl);
 
 private:
   FermionOperator<Impl> & NumOp;// the basic operator
   FermionOperator<Impl> & DenOp;// the basic operator
@@ -50,6 +50,8 @@ NAMESPACE_BEGIN(Grid);
   FermionField PhiOdd;   // the pseudo fermion field for this trajectory
   FermionField PhiEven;  // the pseudo fermion field for this trajectory
 
+  RealD RefreshAction;
+
 public:
   TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl>  &_NumOp,
                                             FermionOperator<Impl>  &_DenOp,
@@ -119,24 +121,38 @@ NAMESPACE_BEGIN(Grid);
       NumOp.ImportGauge(U);
       DenOp.ImportGauge(U);
+      std::cout << " TwoFlavourRefresh: Imported gauge "<<std::endl;
 
       SchurDifferentiableOperator<Impl> Mpc(DenOp);
       SchurDifferentiableOperator<Impl> Vpc(NumOp);
+      std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl;
 
       // Odd det factors
       Mpc.MpcDag(etaOdd,PhiOdd);
+      std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl;
       tmp=Zero();
+      std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl;
       HeatbathSolver(Vpc,PhiOdd,tmp);
+      std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl;
       Vpc.Mpc(tmp,PhiOdd);
+      std::cout << " TwoFlavourRefresh: Mpc "<<std::endl;
 
       // Even det factors
       DenOp.MooeeDag(etaEven,tmp);
       NumOp.MooeeInvDag(tmp,PhiEven);
+      std::cout << " TwoFlavourRefresh: Mee "<<std::endl;
+
+      RefreshAction = norm2(etaEven)+norm2(etaOdd);
+      std::cout << " refresh " <<action_name()<< " action "<<RefreshAction<<std::endl;
     };
 
     //////////////////////////////////////////////////////
     // S = phi^dag V (Mdag M)^-1 Vdag phi
     //////////////////////////////////////////////////////
+    virtual RealD Sinitial(const GaugeField &U) {
+      std::cout << GridLogMessage << "Returning stored two flavour refresh action "<<RefreshAction<<std::endl;
+      return RefreshAction;
+    }
+
    virtual RealD S(const GaugeField &U) {
      NumOp.ImportGauge(U);
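Note on RefreshAction/Sinitial above: the heat-bath refresh constructs Phi from Gaussian eta precisely so that the pseudofermion action at trajectory start equals norm2(eta); storing that number and returning it from Sinitial saves one full solve per trajectory. A minimal sketch of the caching pattern (hypothetical class, not the Grid action hierarchy):

    #include <iostream>
    #include <numeric>
    #include <vector>

    typedef double RealD;

    // Hypothetical pseudofermion-like action caching its start-of-trajectory value.
    struct CachedAction {
      RealD RefreshAction = 0.0;

      void refresh(const std::vector<RealD>& eta) {
        // At refresh time S = |eta|^2 exactly, so no inversion is needed.
        RefreshAction = std::inner_product(eta.begin(), eta.end(), eta.begin(), 0.0);
      }
      RealD Sinitial() const { return RefreshAction; } // reuse the stored value
    };

    int main() {
      CachedAction act;
      act.refresh({0.5, -1.0, 2.0});
      std::cout << "initial action = " << act.Sinitial() << "\n"; // 5.25
      return 0;
    }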

View File

@@ -47,7 +47,7 @@ private:
   const unsigned int N = Impl::Group::Dimension;
 
   typedef typename Field::vector_object vobj;
-  typedef CartesianStencil<vobj, vobj,int> Stencil;
+  typedef CartesianStencil<vobj, vobj,DefaultImplParams> Stencil;
 
   SimpleCompressor<vobj> compressor;
   int npoint = 2 * Ndim;
@@ -82,7 +82,7 @@ public:
   virtual RealD S(const Field &p)
   {
     assert(p.Grid()->Nd() == Ndim);
-    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements,0);
+    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);
     phiStencil.HaloExchange(p, compressor);
     Field action(p.Grid()), pshift(p.Grid()), phisquared(p.Grid());
     phisquared = p * p;
@@ -133,7 +133,7 @@ public:
     double interm_t = usecond();
 
     // move this outside
-    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements,0);
+    static Stencil phiStencil(p.Grid(), npoint, 0, directions, displacements);
     phiStencil.HaloExchange(p, compressor);
     double halo_t = usecond();

View File

@@ -53,6 +53,7 @@ struct HMCparameters: Serializable {
                                   Integer, Trajectories, /* @brief Number of sweeps in this run */
                                   bool, MetropolisTest,
                                   Integer, NoMetropolisUntil,
+                                  bool, PerformRandomShift, /* @brief Randomly shift the gauge configuration at the start of a trajectory */
                                   std::string, StartingType,
                                   IntegratorParameters, MD)
@@ -63,6 +64,7 @@ struct HMCparameters: Serializable {
     StartTrajectory = 0;
     Trajectories = 10;
     StartingType = "HotStart";
+    PerformRandomShift = true;
     /////////////////////////////////
   }
@@ -83,6 +85,7 @@ struct HMCparameters: Serializable {
     std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
     std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
     std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
+    std::cout << GridLogMessage << "[HMC parameters] Doing random shift      : " << std::boolalpha << PerformRandomShift << "\n";
     std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
     MD.print_parameters();
   }
@@ -95,6 +98,7 @@ private:
   const HMCparameters Params;
 
   typedef typename IntegratorType::Field Field;
+  typedef typename IntegratorType::FieldImplementation FieldImplementation;
   typedef std::vector< HmcObservable<Field> * > ObsListType;
 
   //pass these from the resource manager
@@ -138,26 +142,38 @@ private:
     GridBase *Grid = U.Grid();
 
-    //////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Mainly for DDHMC perform a random translation of U modulo volume
-    //////////////////////////////////////////////////////////////////////////////////////////////////////
-    std::cout << GridLogMessage << "--------------------------------------------------\n";
-    std::cout << GridLogMessage << "Random shifting gauge field by [";
-    for(int d=0;d<Grid->Nd();d++) {
-
-      int L = Grid->GlobalDimensions()[d];
-
-      RealD rn_uniform; random(sRNG, rn_uniform);
-
-      int shift = (int) (rn_uniform*L);
-
-      std::cout << shift;
-      if(d<Grid->Nd()-1) std::cout <<",";
-      else std::cout <<"]\n";
-
-      U = Cshift(U,d,shift);
-    }
-    std::cout << GridLogMessage << "--------------------------------------------------\n";
+    if(Params.PerformRandomShift){
+#if 0
+      //////////////////////////////////////////////////////////////////////////////////////////////////////
+      // Mainly for DDHMC perform a random translation of U modulo volume
+      //////////////////////////////////////////////////////////////////////////////////////////////////////
+      std::cout << GridLogMessage << "--------------------------------------------------\n";
+      std::cout << GridLogMessage << "Random shifting gauge field by [";
+
+      std::vector<typename FieldImplementation::GaugeLinkField> Umu(Grid->Nd(), U.Grid());
+      for(int mu=0;mu<Grid->Nd();mu++) Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
+
+      for(int d=0;d<Grid->Nd();d++) {
+
+        int L = Grid->GlobalDimensions()[d];
+
+        RealD rn_uniform; random(sRNG, rn_uniform);
+
+        int shift = (int) (rn_uniform*L);
+
+        std::cout << shift;
+        if(d<Grid->Nd()-1) std::cout <<",";
+        else std::cout <<"]\n";
+
+        //shift all fields together in a way that respects the gauge BCs
+        for(int mu=0; mu < Grid->Nd(); mu++)
+          Umu[mu] = FieldImplementation::CshiftLink(Umu[mu],d,shift);
+        for(int mu=0;mu<Grid->Nd();mu++) PokeIndex<LorentzIndex>(U,Umu[mu],mu);
+      }
+      std::cout << GridLogMessage << "--------------------------------------------------\n";
+#endif
+    }
 
     TheIntegrator.reset_timer();
@@ -174,7 +190,7 @@ private:
     //////////////////////////////////////////////////////////////////////////////////////////////////////
     std::cout << GridLogMessage << "--------------------------------------------------\n";
     std::cout << GridLogMessage << "Compute initial action";
-    RealD H0 = TheIntegrator.S(U);
+    RealD H0 = TheIntegrator.Sinitial(U);
     std::cout << GridLogMessage << "--------------------------------------------------\n";
 
    std::streamsize current_precision = std::cout.precision();
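Note on the (currently #if 0'd) random-shift code above: a plain Cshift of the assembled gauge field is wrong for non-periodic gauge implementations, because links transported through the boundary must pick up the boundary phase; hence the per-link FieldImplementation::CshiftLink. A one-dimensional scalar toy showing the difference for an antiperiodic phase (cshift_link here is an invented illustration, not Grid's CshiftLink):

    #include <complex>
    #include <iostream>
    #include <vector>

    // Toy 1-d "link field" shift with a boundary phase (antiperiodic: phase = -1).
    std::vector<std::complex<double>> cshift_link(const std::vector<std::complex<double>>& u,
                                                  int shift, double phase) {
      const int L = (int)u.size();
      std::vector<std::complex<double>> out(L);
      for (int x = 0; x < L; x++) {
        int y = (x + shift) % L;
        // Sites fetched across the boundary acquire the boundary phase.
        out[x] = (x + shift >= L) ? phase * u[y] : u[y];
      }
      return out;
    }

    int main() {
      std::vector<std::complex<double>> u = {{1,0},{2,0},{3,0},{4,0}};
      auto shifted = cshift_link(u, 3, -1.0);
      for (auto& v : shifted) std::cout << v << " ";
      std::cout << "\n";   // (4,0) (-1,0) (-2,0) (-3,0)
      return 0;
    }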

View File

@@ -63,10 +63,10 @@ public:
 };
 
 /*! @brief Class for Molecular Dynamics management */
-template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy>
+template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy>
 class Integrator {
 protected:
+  typedef FieldImplementation_ FieldImplementation;
   typedef typename FieldImplementation::Field MomentaField;  //for readability
   typedef typename FieldImplementation::Field Field;
@@ -132,10 +132,15 @@ protected:
       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
       double start_force = usecond();
 
+      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
+
       as[level].actions.at(a)->deriv_timer_start();
       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta
       as[level].actions.at(a)->deriv_timer_stop();
+
+      std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
+
       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
       auto name = as[level].actions.at(a)->action_name();
       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
@@ -145,7 +150,7 @@ protected:
       // DumpSliceNorm("force ",force,Nd-1);
       MomFilter->applyFilter(force);
 
-      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<< std::endl;
+      std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<< std::endl;
 
       DumpSliceNorm("force filtered ",force,Nd-1);
       Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
@@ -156,6 +161,7 @@ protected:
       as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
 
+      std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] dt           : " << ep <<" "<<name<<std::endl;
       std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl;
       std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl;
       std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average  : " << impulse_abs <<" "<<name<<std::endl;
@@ -276,6 +282,15 @@ public:
                   << as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
       }
     }
+    std::cout << GridLogMessage << "--------------------------- "<<std::endl;
+    std::cout << GridLogMessage << " Dslash counts "<<std::endl;
+    std::cout << GridLogMessage << "------------------------- "<<std::endl;
+    uint64_t full, partial, dirichlet;
+    DslashGetCounts(dirichlet,partial,full);
+    std::cout << GridLogMessage << " Full BCs              : "<<full<<std::endl;
+    std::cout << GridLogMessage << " Partial dirichlet BCs : "<<partial<<std::endl;
+    std::cout << GridLogMessage << " Dirichlet BCs         : "<<dirichlet<<std::endl;
+
     std::cout << GridLogMessage << "--------------------------- "<<std::endl;
     std::cout << GridLogMessage << " Force average size "<<std::endl;
     std::cout << GridLogMessage << "------------------------- "<<std::endl;
@@ -283,7 +298,7 @@ public:
     for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
       std::cout << GridLogMessage
                 << as[level].actions.at(actionID)->action_name()
-                <<"["<<level<<"]["<< actionID<<"] : "
+                <<"["<<level<<"]["<< actionID<<"] :\n\t\t "
                 <<" force max " << as[level].actions.at(actionID)->deriv_max_average()
                 <<" norm " << as[level].actions.at(actionID)->deriv_norm_average()
                 <<" Fdt max " << as[level].actions.at(actionID)->Fdt_max_average()
@@ -363,9 +378,14 @@ public:
       std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
 
       Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+      std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
      as[level].actions.at(actionID)->refresh_timer_start();
      as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
      as[level].actions.at(actionID)->refresh_timer_stop();
+      std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
     }
 
     // Refresh the higher representation actions
@@ -402,6 +422,7 @@ public:
     // Actions
     for (int level = 0; level < as.size(); ++level) {
       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
        // get gauge field from the SmearingPolicy and
        // based on the boolean is_smeared in actionID
        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
@@ -411,6 +432,7 @@ public:
        as[level].actions.at(actionID)->S_timer_stop();
        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
        H += Hterm;
      }
      as[level].apply(S_hireps, Representations, level, H);
    }
@@ -418,6 +440,52 @@ public:
    return H;
  }
 
+  struct _Sinitial {
+    template <class FieldType, class Repr>
+    void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
+      for (int a = 0; a < repr_set.size(); ++a) {
+        RealD Hterm = repr_set.at(a)->Sinitial(Rep.U);
+        std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
+        H += Hterm;
+      }
+    }
+  } Sinitial_hireps{};
+
+  RealD Sinitial(Field& U)
+  {  // here also U not used
+    std::cout << GridLogIntegrator << "Integrator initial action\n";
+    RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
+    RealD Hterm;
+    // Actions
+    for (int level = 0; level < as.size(); ++level) {
+      for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
+        // get gauge field from the SmearingPolicy and
+        // based on the boolean is_smeared in actionID
+        Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
+        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
+        as[level].actions.at(actionID)->S_timer_start();
+        Hterm = as[level].actions.at(actionID)->Sinitial(Us);
+        as[level].actions.at(actionID)->S_timer_stop();
+        std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
+        H += Hterm;
+      }
+      as[level].apply(Sinitial_hireps, Representations, level, H);
+    }
+
+    return H;
+  }
+
  void integrate(Field& U)
  {
    // reset the clocks

View File

@@ -92,10 +92,11 @@ NAMESPACE_BEGIN(Grid);
  *  P 1/2                     P 1/2
  */
 
-template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
-class LeapFrog : public Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>
+template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
+class LeapFrog : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy>
 {
 public:
+  typedef FieldImplementation_ FieldImplementation;
   typedef LeapFrog<FieldImplementation, SmearingPolicy, RepresentationPolicy> Algorithm;
   INHERIT_FIELD_TYPES(FieldImplementation);
@@ -135,13 +136,14 @@ public:
   }
 };
 
-template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
-class MinimumNorm2 : public Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>
+template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
+class MinimumNorm2 : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy>
 {
 private:
   const RealD lambda = 0.1931833275037836;
 
 public:
+  typedef FieldImplementation_ FieldImplementation;
   INHERIT_FIELD_TYPES(FieldImplementation);
 
   MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm)
@@ -192,8 +194,8 @@ public:
   }
 };
 
-template <class FieldImplementation, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
-class ForceGradient : public Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>
+template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy = Representations<FundamentalRepresentation> >
+class ForceGradient : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy>
 {
 private:
   const RealD lambda = 1.0 / 6.0;
@@ -202,6 +204,7 @@ private:
   const RealD theta = 0.0;
 
 public:
+  typedef FieldImplementation_ FieldImplementation;
   INHERIT_FIELD_TYPES(FieldImplementation);
 
   // Looks like dH scales as dt^4. tested wilson/wilson 2 level.
@@ -227,7 +230,8 @@ public:
     // Presently 4 force evals, and should have 3, so 1.33x too expensive.
     // could reduce this with sloppy CG to perhaps 1.15x too expensive
     // even without prediction.
-    this->update_P(Pfg, Ufg, level, 1.0);
+    this->update_P(Pfg, Ufg, level, fg_dt);
+    Pfg = Pfg*(1.0/fg_dt);
     this->update_U(Pfg, Ufg, fg_dt);
     this->update_P(Ufg, level, ep);
   }
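Note on the ForceGradient change above: starting from Pfg = 0, update_P with step fg_dt followed by the 1/fg_dt rescale leaves exactly the force in Pfg, the same momentum the old dt = 1.0 call produced, while the internals of update_P (force logging, the new per-step dt printout) now see the true step size. A two-line arithmetic check:

    #include <cassert>
    #include <cmath>

    int main() {
      double F = 0.37, fg_dt = 0.01;
      double Pfg = 0.0;
      Pfg += F * fg_dt;                       // update_P(Pfg, Ufg, level, fg_dt)
      Pfg *= 1.0 / fg_dt;                     // Pfg = Pfg*(1.0/fg_dt)
      assert(std::abs(Pfg - F) < 1e-15);      // same as the old update_P(..., 1.0)
      return 0;
    }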

View File

@@ -78,13 +78,13 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
 // Now a specific registration with a fermion field
 // here must instantiate CG and CR for every new fermion field type (macro!!)
 
-static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,
-                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
+static Registrar< ConjugateGradientModule<WilsonFermionD::FermionField>,
+                  HMC_SolverModuleFactory<solver_string, WilsonFermionD::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
 
-static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
-                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB");
+static Registrar< BiCGSTABModule<WilsonFermionD::FermionField>,
+                  HMC_SolverModuleFactory<solver_string, WilsonFermionD::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB");
 
-static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
-                  HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
+static Registrar< ConjugateResidualModule<WilsonFermionD::FermionField>,
+                  HMC_SolverModuleFactory<solver_string, WilsonFermionD::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
 
 // add the staggered, scalar versions here
View File

@ -31,15 +31,16 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
struct TopologySmearingParameters : Serializable { struct TopologySmearingParameters : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters, GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
int, steps,
float, step_size,
int, meas_interval, int, meas_interval,
float, maxTau); float, init_step_size,
float, maxTau,
float, tolerance);
TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f): TopologySmearingParameters(float ss = 0.0f, int mi = 0, float mT = 0.0f, float tol = 1e-4):
steps(s), step_size(ss), meas_interval(mi), maxTau(mT){} init_step_size(ss), meas_interval(mi), maxTau(mT), tolerance(tol){}
template < class ReaderClass > template < class ReaderClass >
TopologySmearingParameters(Reader<ReaderClass>& Reader){ TopologySmearingParameters(Reader<ReaderClass>& Reader){
@ -97,8 +98,8 @@ public:
if (Pars.do_smearing){ if (Pars.do_smearing){
// using wilson flow by default here // using wilson flow by default here
WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval); WilsonFlowAdaptive<PeriodicGimplR> WF(Pars.Smearing.init_step_size, Pars.Smearing.maxTau, Pars.Smearing.tolerance, Pars.Smearing.meas_interval);
WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau); WF.smear(Usmear, U);
Real T0 = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear); Real T0 = WF.energyDensityPlaquette(Pars.Smearing.maxTau, Usmear);
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1) std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "T0 : [ " << traj << " ] "<< T0 << std::endl; << "T0 : [ " << traj << " ] "<< T0 << std::endl;

View File

@ -33,27 +33,25 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template <class Gimpl> template <class Gimpl>
class WilsonFlow: public Smear<Gimpl>{ class WilsonFlowBase: public Smear<Gimpl>{
public: public:
//Store generic measurements to take during smearing process using std::function //Store generic measurements to take during smearing process using std::function
typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType; //int: step, RealD: flow time, GaugeField : the gauge field typedef std::function<void(int, RealD, const typename Gimpl::GaugeField &)> FunctionType; //int: step, RealD: flow time, GaugeField : the gauge field
private: protected:
unsigned int Nstep;
RealD epsilon; //for regular smearing this is the time step, for adaptive it is the initial time step
std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency std::vector< std::pair<int, FunctionType> > functions; //The int maps to the measurement frequency
mutable WilsonGaugeAction<Gimpl> SG; mutable WilsonGaugeAction<Gimpl> SG;
//Evolve the gauge field by 1 step and update tau
void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
//Evolve the gauge field by 1 step and update tau and the current time step eps
void evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps, RealD maxTau) const;
public: public:
INHERIT_GIMPL_TYPES(Gimpl) INHERIT_GIMPL_TYPES(Gimpl)
explicit WilsonFlowBase(unsigned int meas_interval =1):
SG(WilsonGaugeAction<Gimpl>(3.0)) {
// WilsonGaugeAction with beta 3.0
setDefaultMeasurements(meas_interval);
}
void resetActions(){ functions.clear(); } void resetActions(){ functions.clear(); }
void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); } void addMeasurement(int meas_interval, FunctionType meas){ functions.push_back({meas_interval, meas}); }
@ -64,34 +62,11 @@ public:
//and output to stdout //and output to stdout
void setDefaultMeasurements(int topq_meas_interval = 1); void setDefaultMeasurements(int topq_meas_interval = 1);
explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1): void derivative(GaugeField&, const GaugeField&, const GaugeField&) const override{
Nstep(Nstep),
epsilon(epsilon),
SG(WilsonGaugeAction<Gimpl>(3.0)) {
// WilsonGaugeAction with beta 3.0
assert(epsilon > 0.0);
LogMessage();
setDefaultMeasurements(interval);
}
void LogMessage() {
std::cout << GridLogMessage
<< "[WilsonFlow] Nstep : " << Nstep << std::endl;
std::cout << GridLogMessage
<< "[WilsonFlow] epsilon : " << epsilon << std::endl;
std::cout << GridLogMessage
<< "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl;
}
virtual void smear(GaugeField&, const GaugeField&) const;
virtual void derivative(GaugeField&, const GaugeField&, const GaugeField&) const {
assert(0); assert(0);
// undefined for WilsonFlow // undefined for WilsonFlow
} }
void smear_adaptive(GaugeField&, const GaugeField&, RealD maxTau) const;
//Compute t^2 <E(t)> for time t from the plaquette //Compute t^2 <E(t)> for time t from the plaquette
static RealD energyDensityPlaquette(const RealD t, const GaugeField& U); static RealD energyDensityPlaquette(const RealD t, const GaugeField& U);
@@ -115,82 +90,63 @@ public:
std::vector<RealD> flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval = 1);
};
//Basic iterative Wilson flow
template <class Gimpl>
class WilsonFlow: public WilsonFlowBase<Gimpl>{
private:
int Nstep; //number of steps
RealD epsilon; //step size
//Evolve the gauge field by 1 step of size eps and update tau
void evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const;
public:
INHERIT_GIMPL_TYPES(Gimpl)
//Integrate the Wilson flow for Nstep steps of size epsilon
WilsonFlow(const RealD epsilon, const int Nstep, unsigned int meas_interval = 1): WilsonFlowBase<Gimpl>(meas_interval), Nstep(Nstep), epsilon(epsilon){}
void smear(GaugeField& out, const GaugeField& in) const override;
};
//Wilson flow with adaptive step size
template <class Gimpl>
class WilsonFlowAdaptive: public WilsonFlowBase<Gimpl>{
private:
RealD init_epsilon; //initial step size
RealD maxTau; //integrate to t=maxTau
RealD tolerance; //integration error tolerance
//Evolve the gauge field by 1 step and update tau and the current time step eps
//
//If the step size eps is too large that a significant integration error results,
//the gauge field (U) and tau will not be updated and the function will return 0; eps will be adjusted to a smaller
//value for the next iteration.
//
//For a successful integration step the function will return 1
int evolve_step_adaptive(typename Gimpl::GaugeField&U, RealD &tau, RealD &eps) const;
public:
INHERIT_GIMPL_TYPES(Gimpl)
WilsonFlowAdaptive(const RealD init_epsilon, const RealD maxTau, const RealD tolerance, unsigned int meas_interval = 1):
WilsonFlowBase<Gimpl>(meas_interval), init_epsilon(init_epsilon), maxTau(maxTau), tolerance(tolerance){}
void smear(GaugeField& out, const GaugeField& in) const override;
};
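A minimal usage sketch of the split interface (illustrative only: the field names, grid setup, and parameter values below are hypothetical and not part of the patch):

#include <Grid/Grid.h>
using namespace Grid;
// Given a gauge field Umu on some GridCartesian:
// fixed step size: 100 RK3 steps of size 0.01, default measurements every 10 steps
WilsonFlow<PeriodicGimplR> wf(0.01, 100, 10);
LatticeGaugeField V(Umu.Grid());
wf.smear(V, Umu);
// adaptive step size: integrate to t=1.0 from initial step 0.01 with tolerance 1e-4
WilsonFlowAdaptive<PeriodicGimplR> wfa(0.01, 1.0, 1e-4, 10);
wfa.smear(V, Umu);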
////////////////////////////////////////////////////////////////////////////////
// Implementations
////////////////////////////////////////////////////////////////////////////////
template <class Gimpl>
void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
GaugeField Z(U.Grid());
GaugeField tmp(U.Grid());
SG.deriv(U, Z);
Z *= 0.25; // Z0 = 1/4 * F(U)
Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0
Z *= -17.0/8.0;
SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1
Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1
Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1
Z *= -4.0/3.0;
SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2
Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2
tau += epsilon;
}
template <class Gimpl>
void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps, RealD maxTau) const{
if (maxTau - tau < eps){
eps = maxTau-tau;
}
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
GaugeField Z(U.Grid());
GaugeField Zprime(U.Grid());
GaugeField tmp(U.Grid()), Uprime(U.Grid());
Uprime = U;
SG.deriv(U, Z);
Zprime = -Z;
Z *= 0.25; // Z0 = 1/4 * F(U)
Gimpl::update_field(Z, U, -2.0*eps); // U = W1 = exp(ep*Z0)*W0
Z *= -17.0/8.0;
SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1
Zprime += 2.0*tmp;
Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1
Gimpl::update_field(Z, U, -2.0*eps); // U_= W2 = exp(ep*Z)*W1
Z *= -4.0/3.0;
SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2
Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
Gimpl::update_field(Z, U, -2.0*eps); // V(t+e) = exp(ep*Z)*W2
// Ramos
Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
// Compute distance as norm^2 of the difference
GaugeField diffU = U - Uprime;
RealD diff = norm2(diffU);
// adjust integration step
tau += eps;
//std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
eps = eps*0.95*std::pow(1e-4/diff,1./3.);
//std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
}
template <class Gimpl>
RealD WilsonFlow<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
RealD WilsonFlowBase<Gimpl>::energyDensityPlaquette(const RealD t, const GaugeField& U){
static WilsonGaugeAction<Gimpl> SG(3.0);
return 2.0 * t * t * SG.S(U)/U.Grid()->gSites();
}
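In formulas, this is the plaquette estimate of the flowed energy density,
\[ t^2 \langle E(t) \rangle \;=\; \frac{2\, t^2\, S_W[U(t)]}{V_4}, \]
where $S_W$ is the Wilson gauge action at $\beta = 3.0$ and $V_4$ is the global site count returned by gSites().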
//Compute t^2 <E(t)> for time from the 1x1 cloverleaf form
template <class Gimpl>
RealD WilsonFlow<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
RealD WilsonFlowBase<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField& U){
typedef typename Gimpl::GaugeLinkField GaugeMat;
typedef typename Gimpl::GaugeField GaugeLorentz;
@@ -215,7 +171,7 @@ RealD WilsonFlow<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeField
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeField &V, const GaugeField& U, int measure_interval){
std::vector<RealD> out;
resetActions();
addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){
@@ -227,13 +183,13 @@ std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(GaugeFie
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityPlaquette(const GaugeField& U, int measure_interval){
GaugeField V(U);
return flowMeasureEnergyDensityPlaquette(V,U, measure_interval);
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeField &V, const GaugeField& U, int measure_interval){
std::vector<RealD> out;
resetActions();
addMeasurement(measure_interval, [&out](int step, RealD t, const typename Gimpl::GaugeField &U){
@@ -245,16 +201,52 @@ std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(GaugeFi
}
template <class Gimpl>
std::vector<RealD> WilsonFlow<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(const GaugeField& U, int measure_interval){
GaugeField V(U);
return flowMeasureEnergyDensityCloverleaf(V,U, measure_interval);
}
template <class Gimpl>
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
});
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
});
}
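Custom observables can replace these defaults through the same hooks; a small sketch (the measurement body and the interval of 5 are illustrative):

WilsonFlow<PeriodicGimplR> wf(0.01, 100);
wf.resetActions();
wf.addMeasurement(5, [](int step, RealD t, const LatticeGaugeField &U){
  std::cout << GridLogMessage << "t^2 E_clover at t=" << t << " : "
            << WilsonFlow<PeriodicGimplR>::energyDensityCloverleaf(t, U) << std::endl;
});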
//#define WF_TIMING
template <class Gimpl>
void WilsonFlow<Gimpl>::evolve_step(typename Gimpl::GaugeField &U, RealD &tau) const{
GaugeField Z(U.Grid());
GaugeField tmp(U.Grid());
this->SG.deriv(U, Z);
Z *= 0.25; // Z0 = 1/4 * F(U)
Gimpl::update_field(Z, U, -2.0*epsilon); // U = W1 = exp(ep*Z0)*W0
Z *= -17.0/8.0;
this->SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1
Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1
Gimpl::update_field(Z, U, -2.0*epsilon); // U_= W2 = exp(ep*Z)*W1
Z *= -4.0/3.0;
this->SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2
Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
Gimpl::update_field(Z, U, -2.0*epsilon); // V(t+e) = exp(ep*Z)*W2
tau += epsilon;
}
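Written out, the three stages above are the standard third-order Runge-Kutta scheme for the gradient flow, in the W0/W1/W2 notation of the inline comments:
\[ W_1 = e^{\frac{1}{4} Z_0}\, W_0, \qquad W_2 = e^{\frac{8}{9} Z_1 - \frac{17}{36} Z_0}\, W_1, \qquad V_{t+\epsilon} = e^{\frac{3}{4} Z_2 - \frac{8}{9} Z_1 + \frac{17}{36} Z_0}\, W_2, \]
with each $Z_i \propto \epsilon\, \partial S_W(W_i)$ as produced by the successive SG.deriv calls and the in-place rescalings of Z.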
template <class Gimpl>
void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
std::cout << GridLogMessage
<< "[WilsonFlow] Nstep : " << Nstep << std::endl;
std::cout << GridLogMessage
<< "[WilsonFlow] epsilon : " << epsilon << std::endl;
std::cout << GridLogMessage
<< "[WilsonFlow] full trajectory : " << Nstep * epsilon << std::endl;
out = in;
RealD taus = 0.;
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
@@ -266,37 +258,93 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
std::cout << "Time to evolve " << diff.count() << " s\n";
#endif
//Perform measurements
for(auto const &meas : functions)
for(auto const &meas : this->functions)
if( step % meas.first == 0 ) meas.second(step,taus,out);
}
}
template <class Gimpl>
void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, RealD maxTau) const{
out = in;
RealD taus = 0.;
RealD eps = epsilon;
unsigned int step = 0;
do{
step++;
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, taus, eps, maxTau);
//Perform measurements
for(auto const &meas : functions)
if( step % meas.first == 0 ) meas.second(step,taus,out);
} while (taus < maxTau);
}
int WilsonFlowAdaptive<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, RealD &tau, RealD &eps) const{
if (maxTau - tau < eps){
eps = maxTau-tau;
}
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
GaugeField Z(U.Grid());
GaugeField Zprime(U.Grid());
GaugeField tmp(U.Grid()), Uprime(U.Grid()), Usave(U.Grid());
Uprime = U;
Usave = U;
this->SG.deriv(U, Z);
Zprime = -Z;
Z *= 0.25; // Z0 = 1/4 * F(U)
Gimpl::update_field(Z, U, -2.0*eps); // U = W1 = exp(ep*Z0)*W0
Z *= -17.0/8.0;
this->SG.deriv(U, tmp); Z += tmp; // -17/32*Z0 +Z1
Zprime += 2.0*tmp;
Z *= 8.0/9.0; // Z = -17/36*Z0 +8/9*Z1
Gimpl::update_field(Z, U, -2.0*eps); // U_= W2 = exp(ep*Z)*W1
Z *= -4.0/3.0;
this->SG.deriv(U, tmp); Z += tmp; // 4/3*(17/36*Z0 -8/9*Z1) +Z2
Z *= 3.0/4.0; // Z = 17/36*Z0 -8/9*Z1 +3/4*Z2
Gimpl::update_field(Z, U, -2.0*eps); // V(t+e) = exp(ep*Z)*W2
// Ramos arXiv:1301.4388
Gimpl::update_field(Zprime, Uprime, -2.0*eps); // V'(t+e) = exp(ep*Z')*W0
// Compute distance using Ramos' definition
GaugeField diffU = U - Uprime;
RealD max_dist = 0;
for(int mu=0;mu<Nd;mu++){
typename Gimpl::GaugeLinkField diffU_mu = PeekIndex<LorentzIndex>(diffU, mu);
RealD dist_mu = sqrt( maxLocalNorm2(diffU_mu) ) /Nc/Nc; //maximize over sites
max_dist = std::max(max_dist, dist_mu); //maximize over mu
}
int ret;
if(max_dist < tolerance) {
tau += eps;
ret = 1;
} else {
U = Usave;
ret = 0;
}
eps = eps*0.95*std::pow(tolerance/max_dist,1./3.);
std::cout << GridLogMessage << "Adaptive smearing : Distance: "<< max_dist <<" Step successful: " << ret << " New epsilon: " << eps << std::endl;
return ret;
}
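The controller is the usual one for a third-order integrator: with the per-step distance
\[ d = \max_{\mu} \frac{1}{N_c^2} \max_x \big\| U_\mu(x) - U'_\mu(x) \big\|, \]
the step is accepted only if $d < \mathrm{tol}$, and in either case the next step size is set to $\epsilon \leftarrow 0.95\,\epsilon\,(\mathrm{tol}/d)^{1/3}$.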
template <class Gimpl>
void WilsonFlow<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
});
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
});
}
void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
std::cout << GridLogMessage
<< "[WilsonFlow] initial epsilon : " << init_epsilon << std::endl;
std::cout << GridLogMessage
<< "[WilsonFlow] full trajectory : " << maxTau << std::endl;
std::cout << GridLogMessage
<< "[WilsonFlow] tolerance : " << tolerance << std::endl;
out = in;
RealD taus = 0.;
RealD eps = init_epsilon;
unsigned int step = 0;
do{
int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails
//Perform measurements
if(step_success)
for(auto const &meas : this->functions)
if( step % meas.first == 0 ) meas.second(step,taus,out);
} while (taus < maxTau);
} }
NAMESPACE_END(Grid);


@@ -227,26 +227,38 @@ namespace ConjugateBC {
//shift = -1
//Out(x) = U_\mu(x-mu) | x_\mu != 0
// = U*_\mu(L-1) | x_\mu == 0
//shift = 2
//Out(x) = U_\mu(x+2\hat\mu) | x_\mu < L-2
// = U*_\mu(1) | x_\mu == L-1
// = U*_\mu(0) | x_\mu == L-2
//shift = -2
//Out(x) = U_\mu(x-2mu) | x_\mu > 1
// = U*_\mu(L-2) | x_\mu == 0
// = U*_\mu(L-1) | x_\mu == 1
//etc
template<class gauge> Lattice<gauge>
CshiftLink(const Lattice<gauge> &Link, int mu, int shift)
{
GridBase *grid = Link.Grid();
int Lmu = grid->GlobalDimensions()[mu] - 1;
int Lmu = grid->GlobalDimensions()[mu];
assert(abs(shift) < Lmu && "Invalid shift value");
Lattice<iScalar<vInteger>> coor(grid);
LatticeCoordinate(coor, mu);
Lattice<gauge> tmp(grid);
if(shift == 1){
if(shift > 0){
tmp = Cshift(Link, mu, 1);
tmp = Cshift(Link, mu, shift);
tmp = where(coor == Lmu, conjugate(tmp), tmp);
tmp = where(coor >= Lmu-shift, conjugate(tmp), tmp);
return tmp;
}else if(shift == -1){
}else if(shift < 0){
tmp = Link;
tmp = where(coor == Lmu, conjugate(tmp), tmp);
tmp = where(coor >= Lmu+shift, conjugate(tmp), tmp);
return Cshift(tmp, mu, -1);
return Cshift(tmp, mu, shift);
}else assert(0 && "Invalid shift value");
}
return tmp; //shuts up the compiler fussing about the return type
//shift == 0
return Link;
}
}
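A sketch of the generalized call (illustrative; U and mu are assumed to be a conjugate-BC gauge field and a direction index already in scope):

typedef typename ConjugateGimplR::GaugeLinkField GaugeMat;
GaugeMat Umu = PeekIndex<LorentzIndex>(U, mu);
GaugeMat Ufwd2 = ConjugateBC::CshiftLink(Umu, mu, +2); // conjugates links that wrap the boundary
GaugeMat Ubwd2 = ConjugateBC::CshiftLink(Umu, mu, -2);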


@@ -72,12 +72,12 @@ public:
//Fix the gauge field Umu
//0 < alpha < 1 is related to the step size, cf https://arxiv.org/pdf/1405.5812.pdf
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
GridBase *grid = Umu.Grid();
GaugeMat xform(grid);
SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog,err_on_no_converge);
}
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1,bool err_on_no_converge=true) {
//Fix the gauge field Umu and also return the gauge transformation from the original gauge field, xform
GridBase *grid = Umu.Grid();
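Since alpha is now passed by value rather than by non-const reference, callers no longer need a named lvalue for it; a hedged sketch (Fixer stands in for the enclosing fixer class, and the parameter values are illustrative):

LatticeGaugeField Umu(grid); // assumed already initialised
Fixer::SteepestDescentGaugeFix(Umu, 0.1, 10000, 1e-12, 1e-12, true); // alpha = 0.1 as a temporary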


@@ -615,7 +615,6 @@ public:
GridBase *grid = out.Grid();
typedef typename LatticeMatrixType::vector_type vector_type;
typedef typename LatticeMatrixType::scalar_type scalar_type;
typedef iSinglet<vector_type> vTComplexType;


@@ -26,7 +26,7 @@
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#ifndef GRID_HIP
NAMESPACE_BEGIN(Grid);
@@ -82,7 +82,7 @@ void JSONWriter::writeDefault(const std::string &s, const std::string &x)
if (s.size())
ss_ << "\""<< s << "\" : \"" << os.str() << "\" ," ;
else
ss_ << os.str() << " ," ;
ss_ << "\""<< os.str() << "\" ," ;
}
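The effect of the writer fix on an anonymous string value, as an illustrative before/after:

// before: writeDefault("", "bare") emitted:  bare ,   (unquoted, invalid JSON)
// after:  it emits:                          "bare" ,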
// Reader implementation ///////////////////////////////////////////////////////


@@ -54,7 +54,7 @@ namespace Grid
void pop(void);
template <typename U>
void writeDefault(const std::string &s, const U &x);
#ifdef __NVCC__
#if defined(GRID_CUDA) || defined(GRID_HIP)
void writeDefault(const std::string &s, const Grid::ComplexD &x)
{
std::complex<double> z(real(x),imag(x));
@@ -101,7 +101,7 @@ namespace Grid
void readDefault(const std::string &s, std::vector<U> &output);
template <typename U, typename P>
void readDefault(const std::string &s, std::pair<U,P> &output);
#ifdef __NVCC__
#if defined(GRID_CUDA) || defined(GRID_HIP)
void readDefault(const std::string &s, ComplexD &output)
{
std::complex<double> z;


@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "BinaryIO.h"
#include "TextIO.h"
#include "XmlIO.h"
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#ifndef GRID_HIP
#include "JSON_IO.h"
#endif


@@ -501,7 +501,7 @@ struct Conj{
struct TimesMinusI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
inline vec<T> operator()(vec<T> a){
vec<T> out;
const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
svbool_t pg1 = acle<T>::pg1();
@@ -520,7 +520,7 @@ struct TimesMinusI{
struct TimesI{
// Complex
template <typename T>
inline vec<T> operator()(vec<T> a, vec<T> b){
inline vec<T> operator()(vec<T> a){
vec<T> out;
const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
svbool_t pg1 = acle<T>::pg1();
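The dropped second operand was a dummy that every implementation ignored; call sites change accordingly (sketch):

// before: out = TimesI()(in, out); // second argument unused
// after:  out = TimesI()(in);

The same one-argument signature is applied uniformly to every SIMD backend below.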


@@ -418,7 +418,7 @@ struct Conj{
struct TimesMinusI{
// Complex float
inline vecf operator()(vecf a, vecf b){
inline vecf operator()(vecf a){
lutf tbl_swap = acle<float>::tbl_swap();
pred pg1 = acle<float>::pg1();
pred pg_odd = acle<float>::pg_odd();
@@ -428,7 +428,7 @@ struct TimesMinusI{
return svneg_m(a_v, pg_odd, a_v);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
inline vecd operator()(vecd a){
lutd tbl_swap = acle<double>::tbl_swap();
pred pg1 = acle<double>::pg1();
pred pg_odd = acle<double>::pg_odd();
@@ -441,7 +441,7 @@ struct TimesMinusI{
struct TimesI{
// Complex float
inline vecf operator()(vecf a, vecf b){
inline vecf operator()(vecf a){
lutf tbl_swap = acle<float>::tbl_swap();
pred pg1 = acle<float>::pg1();
pred pg_even = acle<float>::pg_even();
@@ -451,7 +451,7 @@ struct TimesI{
return svneg_m(a_v, pg_even, a_v);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
inline vecd operator()(vecd a){
lutd tbl_swap = acle<double>::tbl_swap();
pred pg1 = acle<double>::pg1();
pred pg_even = acle<double>::pg_even();


@@ -405,12 +405,12 @@ struct Conj{
struct TimesMinusI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
inline __m256 operator()(__m256 in){
__m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
inline __m256d operator()(__m256d in){
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
return _mm256_shuffle_pd(tmp,tmp,0x5);
}
@@ -418,12 +418,12 @@ struct TimesMinusI{
struct TimesI{
//Complex single
inline __m256 operator()(__m256 in, __m256 ret){
inline __m256 operator()(__m256 in){
__m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
return _mm256_addsub_ps(_mm256_setzero_ps(),tmp); // i,-r
}
//Complex double
inline __m256d operator()(__m256d in, __m256d ret){
inline __m256d operator()(__m256d in){
__m256d tmp = _mm256_shuffle_pd(in,in,0x5);
return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
}


@@ -271,14 +271,14 @@ struct Conj{
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
inline __m512 operator()(__m512 in){
//__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
//return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
inline __m512d operator()(__m512d in){
//__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
//return _mm512_shuffle_pd(tmp,tmp,0x55);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
@@ -288,17 +288,16 @@ struct TimesMinusI{
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
inline __m512 operator()(__m512 in){
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
inline __m512d operator()(__m512d in){
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
}
};
// Gpermute utilities consider coalescing into 1 Gpermute


@@ -0,0 +1,666 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_vector_types.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template <class Scalar_type, class Vector_type>
class Grid_simd2 {
public:
typedef typename RealPart<Scalar_type>::type Real;
typedef Vector_type vector_type;
typedef Scalar_type scalar_type;
typedef union conv_t_union {
Vector_type v;
Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
accelerator_inline conv_t_union(){};
} conv_t;
static constexpr int nvec=2;
Vector_type v[nvec];
static accelerator_inline constexpr int Nsimd(void) {
static_assert( (sizeof(Vector_type) / sizeof(Scalar_type) >= 1), " size mismatch " );
return nvec*sizeof(Vector_type) / sizeof(Scalar_type);
}
accelerator_inline Grid_simd2 &operator=(const Grid_simd2 &&rhs) {
for(int n=0;n<nvec;n++) v[n] = rhs.v[n];
return *this;
};
accelerator_inline Grid_simd2 &operator=(const Grid_simd2 &rhs) {
for(int n=0;n<nvec;n++) v[n] = rhs.v[n];
return *this;
}; // faster than not declaring it and leaving to the compiler
accelerator Grid_simd2() = default;
accelerator_inline Grid_simd2(const Grid_simd2 &rhs) { for(int n=0;n<nvec;n++) v[n] = rhs.v[n]; };
accelerator_inline Grid_simd2(const Grid_simd2 &&rhs){ for(int n=0;n<nvec;n++) v[n] = rhs.v[n]; };
accelerator_inline Grid_simd2(const Real a) { vsplat(*this, Scalar_type(a)); };
// Enable if complex type
template <typename S = Scalar_type> accelerator_inline
Grid_simd2(const typename std::enable_if<is_complex<S>::value, S>::type a) {
vsplat(*this, a);
};
/////////////////////////////
// Constructors
/////////////////////////////
accelerator_inline Grid_simd2 & operator=(const Zero &z) {
vzero(*this);
return (*this);
}
///////////////////////////////////////////////
// mac, mult, sub, add, adj
///////////////////////////////////////////////
friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
const Grid_simd2 *__restrict__ a,
const Grid_simd2 *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
const Grid_simd2 *__restrict__ l,
const Grid_simd2 *__restrict__ r) {
*y = (*l) * (*r);
}
friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
const Grid_simd2 *__restrict__ l,
const Grid_simd2 *__restrict__ r) {
*y = (*l) - (*r);
}
friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
const Grid_simd2 *__restrict__ l,
const Grid_simd2 *__restrict__ r) {
*y = (*l) + (*r);
}
friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
const Scalar_type *__restrict__ a,
const Grid_simd2 *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd2 *__restrict__ r) {
*y = (*l) * (*r);
}
friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd2 *__restrict__ r) {
*y = (*l) - (*r);
}
friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
const Scalar_type *__restrict__ l,
const Grid_simd2 *__restrict__ r) {
*y = (*l) + (*r);
}
friend accelerator_inline void mac(Grid_simd2 *__restrict__ y,
const Grid_simd2 *__restrict__ a,
const Scalar_type *__restrict__ x) {
*y = (*a) * (*x) + (*y);
};
friend accelerator_inline void mult(Grid_simd2 *__restrict__ y,
const Grid_simd2 *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) * (*r);
}
friend accelerator_inline void sub(Grid_simd2 *__restrict__ y,
const Grid_simd2 *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) - (*r);
}
friend accelerator_inline void add(Grid_simd2 *__restrict__ y,
const Grid_simd2 *__restrict__ l,
const Scalar_type *__restrict__ r) {
*y = (*l) + (*r);
}
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend accelerator_inline void vset(Grid_simd2 &ret, Scalar_type *a) {
for(int n=0;n<nvec;n++) vset(ret.v[n],a);
}
///////////////////////
// Vstore
///////////////////////
friend accelerator_inline void vstore(const Grid_simd2 &ret, Scalar_type *a) {
for(int n=0;n<nvec;n++) vstore(ret.v[n],a);
}
///////////////////////
// Vprefetch
///////////////////////
friend accelerator_inline void vprefetch(const Grid_simd2 &v) {
vprefetch(v.v[0]);
}
///////////////////////
// Reduce
///////////////////////
friend accelerator_inline Scalar_type Reduce(const Grid_simd2 &in) {
return Reduce(in.v[0])+ Reduce(in.v[1]);
}
////////////////////////////
// operator scalar * simd
////////////////////////////
friend accelerator_inline Grid_simd2 operator*(const Scalar_type &a, Grid_simd2 b) {
Grid_simd2 va;
vsplat(va, a);
return va * b;
}
friend accelerator_inline Grid_simd2 operator*(Grid_simd2 b, const Scalar_type &a) {
return a * b;
}
//////////////////////////////////
// Divides
//////////////////////////////////
friend accelerator_inline Grid_simd2 operator/(const Scalar_type &a, Grid_simd2 b) {
Grid_simd2 va;
vsplat(va, a);
return va / b;
}
friend accelerator_inline Grid_simd2 operator/(Grid_simd2 b, const Scalar_type &a) {
Grid_simd2 va;
vsplat(va, a);
return b / a;
}
///////////////////////
// Unary negation
///////////////////////
friend accelerator_inline Grid_simd2 operator-(const Grid_simd2 &r) {
Grid_simd2 ret;
vzero(ret);
ret = ret - r;
return ret;
}
// *=,+=,-= operators
accelerator_inline Grid_simd2 &operator*=(const Grid_simd2 &r) {
*this = (*this) * r;
return *this;
}
accelerator_inline Grid_simd2 &operator+=(const Grid_simd2 &r) {
*this = *this + r;
return *this;
}
accelerator_inline Grid_simd2 &operator-=(const Grid_simd2 &r) {
*this = *this - r;
return *this;
}
///////////////////////////////////////
// Not all functions are supported
// through SIMD and must breakout to
// scalar type and back again. This
// provides support
///////////////////////////////////////
template <class functor>
friend accelerator_inline Grid_simd2 SimdApply(const functor &func, const Grid_simd2 &v) {
Grid_simd2 ret;
for(int n=0;n<nvec;n++){
ret.v[n]=SimdApply(func,v.v[n]);
}
return ret;
}
template <class functor>
friend accelerator_inline Grid_simd2 SimdApplyBinop(const functor &func,
const Grid_simd2 &x,
const Grid_simd2 &y) {
Grid_simd2 ret;
for(int n=0;n<nvec;n++){
ret.v[n]=SimdApplyBinop(func,x.v[n],y.v[n]);
}
return ret;
}
///////////////////////
// Exchange
// Al Ah , Bl Bh -> Al Bl Ah,Bh
///////////////////////
friend accelerator_inline void exchange0(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
out1.v[0] = in1.v[0];
out1.v[1] = in2.v[0];
out2.v[0] = in1.v[1];
out2.v[1] = in2.v[1];
}
friend accelerator_inline void exchange1(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
exchange0(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
exchange0(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
}
friend accelerator_inline void exchange2(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
exchange1(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
exchange1(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
}
friend accelerator_inline void exchange3(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
exchange2(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
exchange2(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
}
friend accelerator_inline void exchange4(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2){
exchange3(out1.v[0],out2.v[0],in1.v[0],in2.v[0]);
exchange3(out1.v[1],out2.v[1],in1.v[1],in2.v[1]);
}
friend accelerator_inline void exchange(Grid_simd2 &out1,Grid_simd2 &out2,Grid_simd2 in1,Grid_simd2 in2,int n)
{
if (n==3) {
exchange3(out1,out2,in1,in2);
} else if(n==2) {
exchange2(out1,out2,in1,in2);
} else if(n==1) {
exchange1(out1,out2,in1,in2);
} else if(n==0) {
exchange0(out1,out2,in1,in2);
}
}
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend accelerator_inline void permute0(Grid_simd2 &y, Grid_simd2 b) {
y.v[0]=b.v[1];
y.v[1]=b.v[0];
}
friend accelerator_inline void permute1(Grid_simd2 &y, Grid_simd2 b) {
permute0(y.v[0],b.v[0]);
permute0(y.v[1],b.v[1]);
}
friend accelerator_inline void permute2(Grid_simd2 &y, Grid_simd2 b) {
permute1(y.v[0],b.v[0]);
permute1(y.v[1],b.v[1]);
}
friend accelerator_inline void permute3(Grid_simd2 &y, Grid_simd2 b) {
permute2(y.v[0],b.v[0]);
permute2(y.v[1],b.v[1]);
}
friend accelerator_inline void permute4(Grid_simd2 &y, Grid_simd2 b) {
permute3(y.v[0],b.v[0]);
permute3(y.v[1],b.v[1]);
}
friend accelerator_inline void permute(Grid_simd2 &y, Grid_simd2 b, int perm) {
if(perm==3) permute3(y, b);
else if(perm==2) permute2(y, b);
else if(perm==1) permute1(y, b);
else if(perm==0) permute0(y, b);
}
///////////////////////////////
// Getting single lanes
///////////////////////////////
accelerator_inline Scalar_type getlane(int lane) const {
if(lane < vector_type::Nsimd() ) return v[0].getlane(lane);
else return v[1].getlane(lane%vector_type::Nsimd());
}
accelerator_inline void putlane(const Scalar_type &S, int lane){
if(lane < vector_type::Nsimd() ) v[0].putlane(S,lane);
else v[1].putlane(S,lane%vector_type::Nsimd());
}
}; // end of Grid_simd2 class definition
///////////////////////////////
// Define available types
///////////////////////////////
typedef Grid_simd2<complex<double> , vComplexD> vComplexD2;
typedef Grid_simd2<double , vRealD> vRealD2;
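A vComplexD2 is simply two vComplexD registers side by side, so its lane count is exactly double; a compile-time check of that invariant (a sketch, assuming Nsimd() is constexpr for Grid_simd as it is for Grid_simd2 above):

static_assert(vComplexD2::Nsimd() == 2 * vComplexD::Nsimd(), "Grid_simd2 packs two registers");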
/////////////////////////////////////////
// Some traits to recognise the types
/////////////////////////////////////////
template <typename T>
struct is_simd : public std::false_type {};
template <> struct is_simd<vRealF> : public std::true_type {};
template <> struct is_simd<vRealD> : public std::true_type {};
template <> struct is_simd<vRealH> : public std::true_type {};
template <> struct is_simd<vComplexF> : public std::true_type {};
template <> struct is_simd<vComplexD> : public std::true_type {};
template <> struct is_simd<vComplexH> : public std::true_type {};
template <> struct is_simd<vInteger> : public std::true_type {};
template <> struct is_simd<vRealD2> : public std::true_type {};
template <> struct is_simd<vComplexD2> : public std::true_type {};
template <typename T> using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
///////////////////////////////////////////////
// insert / extract with complex support
///////////////////////////////////////////////
template <class S, class V>
accelerator_inline S getlane(const Grid_simd<S, V> &in,int lane) {
return in.getlane(lane);
}
template <class S, class V>
accelerator_inline void putlane(Grid_simd<S, V> &vec,const S &_S, int lane){
vec.putlane(_S,lane);
}
template <class S,IfNotSimd<S> = 0 >
accelerator_inline S getlane(const S &in,int lane) {
return in;
}
template <class S,IfNotSimd<S> = 0 >
accelerator_inline void putlane(S &vec,const S &_S, int lane){
vec = _S;
}
template <class S, class V>
accelerator_inline S getlane(const Grid_simd2<S, V> &in,int lane) {
return in.getlane(lane);
}
template <class S, class V>
accelerator_inline void putlane(Grid_simd2<S, V> &vec,const S &_S, int lane){
vec.putlane(_S,lane);
}
////////////////////////////////////////////////////////////////////
// General rotate
////////////////////////////////////////////////////////////////////
template <class S, class V>
accelerator_inline void vbroadcast(Grid_simd2<S,V> &ret,const Grid_simd2<S,V> &src,int lane){
S* typepun =(S*) &src;
vsplat(ret,typepun[lane]);
}
template <class S, class V, IfComplex<S> =0>
accelerator_inline void rbroadcast(Grid_simd2<S,V> &ret,const Grid_simd2<S,V> &src,int lane){
typedef typename V::vector_type vector_type;
S* typepun =(S*) &src;
ret.v[0].v = unary<vector_type>(real(typepun[lane]), VsplatSIMD());
ret.v[1].v = unary<vector_type>(real(typepun[lane]), VsplatSIMD());
}
///////////////////////
// Splat
///////////////////////
// this is only for the complex version
template <class S, class V, IfComplex<S> = 0, class ABtype>
accelerator_inline void vsplat(Grid_simd2<S, V> &ret, ABtype a, ABtype b) {
vsplat(ret.v[0],a,b);
vsplat(ret.v[1],a,b);
}
// overload if complex
template <class S, class V>
accelerator_inline void vsplat(Grid_simd2<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), imag(c));
}
template <class S, class V>
accelerator_inline void rsplat(Grid_simd2<S, V> &ret, EnableIf<is_complex<S>, S> c) {
vsplat(ret, real(c), real(c));
}
// if real fill with a, if complex fill with a in the real part (first function
// above)
template <class S, class V>
accelerator_inline void vsplat(Grid_simd2<S, V> &ret, NotEnableIf<is_complex<S>, S> a)
{
vsplat(ret.v[0],a);
vsplat(ret.v[1],a);
}
//////////////////////////
///////////////////////////////////////////////
// Initialise to 1,0,i for the correct types
///////////////////////////////////////////////
// For complex types
template <class S, class V, IfComplex<S> = 0>
accelerator_inline void vone(Grid_simd2<S, V> &ret) {
vsplat(ret, S(1.0, 0.0));
}
template <class S, class V, IfComplex<S> = 0>
accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
vsplat(ret, S(0.0, 0.0));
} // use xor?
template <class S, class V, IfComplex<S> = 0>
accelerator_inline void vcomplex_i(Grid_simd2<S, V> &ret) {
vsplat(ret, S(0.0, 1.0));
}
template <class S, class V, IfComplex<S> = 0>
accelerator_inline void visign(Grid_simd2<S, V> &ret) {
vsplat(ret, S(1.0, -1.0));
}
template <class S, class V, IfComplex<S> = 0>
accelerator_inline void vrsign(Grid_simd2<S, V> &ret) {
vsplat(ret, S(-1.0, 1.0));
}
// if not complex overload here
template <class S, class V, IfReal<S> = 0>
accelerator_inline void vone(Grid_simd2<S, V> &ret) {
vsplat(ret, S(1.0));
}
template <class S, class V, IfReal<S> = 0>
accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
vsplat(ret, S(0.0));
}
// For integral types
template <class S, class V, IfInteger<S> = 0>
accelerator_inline void vone(Grid_simd2<S, V> &ret) {
vsplat(ret, 1);
}
template <class S, class V, IfInteger<S> = 0>
accelerator_inline void vzero(Grid_simd2<S, V> &ret) {
vsplat(ret, 0);
}
template <class S, class V, IfInteger<S> = 0>
accelerator_inline void vtrue(Grid_simd2<S, V> &ret) {
vsplat(ret, 0xFFFFFFFF);
}
template <class S, class V, IfInteger<S> = 0>
accelerator_inline void vfalse(Grid_simd2<S, V> &ret) {
vsplat(ret, 0);
}
template <class S, class V>
accelerator_inline void zeroit(Grid_simd2<S, V> &z) {
vzero(z);
}
///////////////////////
// Vstream
///////////////////////
template <class S, class V, IfReal<S> = 0>
accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
vstream(out.v[0],in.v[0]);
vstream(out.v[1],in.v[1]);
}
template <class S, class V, IfComplex<S> = 0>
accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
vstream(out.v[0],in.v[0]);
vstream(out.v[1],in.v[1]);
}
template <class S, class V, IfInteger<S> = 0>
accelerator_inline void vstream(Grid_simd2<S, V> &out, const Grid_simd2<S, V> &in) {
vstream(out.v[0],in.v[0]);
vstream(out.v[1],in.v[1]);
}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
template <class S, class V>
accelerator_inline Grid_simd2<S, V> operator+(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
Grid_simd2<S, V> ret;
ret.v[0] = a.v[0]+b.v[0];
ret.v[1] = a.v[1]+b.v[1];
return ret;
};
template <class S, class V>
accelerator_inline Grid_simd2<S, V> operator-(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
Grid_simd2<S, V> ret;
ret.v[0] = a.v[0]-b.v[0];
ret.v[1] = a.v[1]-b.v[1];
return ret;
};
// Distinguish between complex types and others
template <class S, class V, IfComplex<S> = 0>
accelerator_inline Grid_simd2<S, V> real_mult(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
Grid_simd2<S, V> ret;
ret.v[0] =real_mult(a.v[0],b.v[0]);
ret.v[1] =real_mult(a.v[1],b.v[1]);
return ret;
};
template <class S, class V, IfComplex<S> = 0>
accelerator_inline Grid_simd2<S, V> real_madd(Grid_simd2<S, V> a, Grid_simd2<S, V> b, Grid_simd2<S,V> c) {
Grid_simd2<S, V> ret;
ret.v[0] =real_madd(a.v[0],b.v[0],c.v[0]);
ret.v[1] =real_madd(a.v[1],b.v[1],c.v[1]);
return ret;
};
// Distinguish between complex types and others
template <class S, class V>
accelerator_inline Grid_simd2<S, V> operator*(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
Grid_simd2<S, V> ret;
ret.v[0] = a.v[0]*b.v[0];
ret.v[1] = a.v[1]*b.v[1];
return ret;
};
// Distinguish between complex types and others
template <class S, class V>
accelerator_inline Grid_simd2<S, V> operator/(Grid_simd2<S, V> a, Grid_simd2<S, V> b) {
Grid_simd2<S, V> ret;
ret.v[0] = a.v[0]/b.v[0];
ret.v[1] = a.v[1]/b.v[1];
return ret;
};
///////////////////////
// Conjugate
///////////////////////
template <class S, class V>
accelerator_inline Grid_simd2<S, V> conjugate(const Grid_simd2<S, V> &in) {
Grid_simd2<S, V> ret;
ret.v[0] = conjugate(in.v[0]);
ret.v[1] = conjugate(in.v[1]);
return ret;
}
template <class S, class V, IfNotInteger<S> = 0>
accelerator_inline Grid_simd2<S, V> adj(const Grid_simd2<S, V> &in) {
return conjugate(in);
}
///////////////////////
// timesMinusI
///////////////////////
template <class S, class V>
accelerator_inline void timesMinusI(Grid_simd2<S, V> &ret, const Grid_simd2<S, V> &in) {
timesMinusI(ret.v[0],in.v[0]);
timesMinusI(ret.v[1],in.v[1]);
}
template <class S, class V>
accelerator_inline Grid_simd2<S, V> timesMinusI(const Grid_simd2<S, V> &in) {
Grid_simd2<S, V> ret;
timesMinusI(ret.v[0],in.v[0]);
timesMinusI(ret.v[1],in.v[1]);
return ret;
}
///////////////////////
// timesI
///////////////////////
template <class S, class V>
accelerator_inline void timesI(Grid_simd2<S, V> &ret, const Grid_simd2<S, V> &in) {
timesI(ret.v[0],in.v[0]);
timesI(ret.v[1],in.v[1]);
}
template <class S, class V>
accelerator_inline Grid_simd2<S, V> timesI(const Grid_simd2<S, V> &in) {
Grid_simd2<S, V> ret;
timesI(ret.v[0],in.v[0]);
timesI(ret.v[1],in.v[1]);
return ret;
}
/////////////////////
// Inner, outer
/////////////////////
template <class S, class V>
accelerator_inline Grid_simd2<S, V> innerProduct(const Grid_simd2<S, V> &l,const Grid_simd2<S, V> &r) {
return conjugate(l) * r;
}
template <class S, class V>
accelerator_inline Grid_simd2<S, V> outerProduct(const Grid_simd2<S, V> &l,const Grid_simd2<S, V> &r) {
return l * conjugate(r);
}
template <class S, class V>
accelerator_inline Grid_simd2<S, V> trace(const Grid_simd2<S, V> &arg) {
return arg;
}
////////////////////////////////////////////////////////////
// copy/splat complex real parts into real;
// insert real into complex and zero imag;
////////////////////////////////////////////////////////////
accelerator_inline void precisionChange(vComplexD2 &out,const vComplexF &in){
Optimization::PrecisionChange::StoD(in.v,out.v[0].v,out.v[1].v);
}
accelerator_inline void precisionChange(vComplexF &out,const vComplexD2 &in){
out.v=Optimization::PrecisionChange::DtoS(in.v[0].v,in.v[1].v);
}
accelerator_inline void precisionChange(vComplexD2 *out,const vComplexF *in,int nvec){
for(int m=0;m<nvec;m++){ precisionChange(out[m],in[m]); }
}
accelerator_inline void precisionChange(vComplexF *out,const vComplexD2 *in,int nvec){
for(int m=0;m<nvec;m++){ precisionChange(out[m],in[m]); }
}
accelerator_inline void precisionChange(vRealD2 &out,const vRealF &in){
Optimization::PrecisionChange::StoD(in.v,out.v[0].v,out.v[1].v);
}
accelerator_inline void precisionChange(vRealF &out,const vRealD2 &in){
out.v=Optimization::PrecisionChange::DtoS(in.v[0].v,in.v[1].v);
}
accelerator_inline void precisionChange(vRealD2 *out,const vRealF *in,int nvec){
for(int m=0;m<nvec;m++){ precisionChange(out[m],in[m]); }
}
accelerator_inline void precisionChange(vRealF *out,const vRealD2 *in,int nvec){
for(int m=0;m<nvec;m++){ precisionChange(out[m],in[m]); }
}
NAMESPACE_END(Grid);
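A round-trip usage of the new precisionChange overloads (the variable names are local to this sketch):

vComplexF cf; vone(cf);
vComplexD2 cd2;
precisionChange(cd2, cf); // StoD: one single-precision register -> a double-precision pair
precisionChange(cf, cd2); // DtoS: and back to single precision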


@@ -244,7 +244,7 @@ struct Conj{
struct TimesMinusI{
// Complex
template <typename T>
accelerator_inline vec<T> operator()(vec<T> a, vec<T> b){
accelerator_inline vec<T> operator()(vec<T> a){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)
@@ -265,7 +265,7 @@ struct TimesMinusI{
struct TimesI{
// Complex
template <typename T>
accelerator_inline vec<T> operator()(vec<T> a, vec<T> b){
accelerator_inline vec<T> operator()(vec<T> a){
vec<T> out;
VECTOR_FOR(i, W<T>::c, 1)

Grid/simd/Grid_gpu_rrii.h (new file, 878 lines)
@@ -0,0 +1,878 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_gpu.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_gpu_rrii.h*/
//----------------------------------------------------------------------
//////////////////////////////
// fp16
//////////////////////////////
#ifdef GRID_CUDA
#include <cuda_fp16.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_fp16.h>
#endif
#if !defined(GRID_HIP) && !defined(GRID_CUDA)
namespace Grid {
typedef struct { uint16_t x;} half;
}
#endif
namespace Grid {
accelerator_inline float half2float(half h)
{
float f;
#if defined(GRID_CUDA) || defined(GRID_HIP)
f = __half2float(h);
#else
Grid_half hh;
hh.x = h.x;
f= sfw_half_to_float(hh);
#endif
return f;
}
accelerator_inline half float2half(float f)
{
half h;
#if defined(GRID_CUDA) || defined(GRID_HIP)
h = __float2half(f);
#else
Grid_half hh = sfw_float_to_half(f);
h.x = hh.x;
#endif
return h;
}
}
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
namespace Grid {
////////////////////////////////////////////////////////////////////////
// Real vector
////////////////////////////////////////////////////////////////////////
template<int _N, class _datum>
struct GpuVector {
_datum rrrr[_N];
static const int N = _N;
typedef _datum datum;
};
template<int N,class datum>
inline accelerator GpuVector<N,datum> operator*(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
GpuVector<N,datum> ret;
for(int i=0;i<N;i++) {
ret.rrrr[i] = l.rrrr[i]*r.rrrr[i];
}
return ret;
}
template<int N,class datum>
inline accelerator GpuVector<N,datum> operator-(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
GpuVector<N,datum> ret;
for(int i=0;i<N;i++) {
ret.rrrr[i] = l.rrrr[i]-r.rrrr[i];
}
return ret;
}
template<int N,class datum>
inline accelerator GpuVector<N,datum> operator+(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
GpuVector<N,datum> ret;
for(int i=0;i<N;i++) {
ret.rrrr[i] = l.rrrr[i]+r.rrrr[i];
}
return ret;
}
template<int N,class datum>
inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
GpuVector<N,datum> ret;
for(int i=0;i<N;i++) {
ret.rrrr[i] = l.rrrr[i]/r.rrrr[i];
}
return ret;
}
////////////////////////////////////////////////////////////////////////
// Complex vector
////////////////////////////////////////////////////////////////////////
template<int _N, class _datum>
struct GpuComplexVector {
_datum rrrr[_N];
_datum iiii[_N];
static const int N = _N;
typedef _datum datum;
};
template<int N,class datum>
inline accelerator GpuComplexVector<N,datum> operator*(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
GpuComplexVector<N,datum> ret;
for(int i=0;i<N;i++) {
ret.rrrr[i] = l.rrrr[i]*r.rrrr[i] - l.iiii[i]*r.iiii[i];
ret.iiii[i] = l.rrrr[i]*r.iiii[i] + l.iiii[i]*r.rrrr[i];
}
return ret;
}
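Lane by lane this is the usual complex product on the split storage,
\[ (ab)_r = a_r b_r - a_i b_i, \qquad (ab)_i = a_r b_i + a_i b_r, \]
which is the point of the rrii layout: real and imaginary parts stream through separate arrays rather than being interleaved.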
template<int N,class datum>
inline accelerator GpuComplexVector<N,datum> operator-(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
GpuComplexVector<N,datum> ret;
for(int i=0;i<N;i++) {
ret.rrrr[i] = l.rrrr[i]-r.rrrr[i];
ret.iiii[i] = l.iiii[i]-r.iiii[i];
}
return ret;
}
template<int N,class datum>
inline accelerator GpuComplexVector<N,datum> operator+(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
GpuComplexVector<N,datum> ret;
for(int i=0;i<N;i++) {
ret.rrrr[i] = l.rrrr[i]+r.rrrr[i];
ret.iiii[i] = l.iiii[i]+r.iiii[i];
}
return ret;
}
template<int N,class datum>
inline accelerator GpuComplexVector<N,datum> operator/(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
GpuComplexVector<N,datum> ret;
for(int i=0;i<N;i++) {
ret.rrrr[i] = l.rrrr[i]/r.rrrr[i];
ret.iiii[i] = l.iiii[i]/r.iiii[i];
}
return ret;
}
////////////////////////////////
// SIMD counts
////////////////////////////////
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half);
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float);
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double);
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
typedef GpuVector<NSIMD_RealH , half > GpuVectorRH;
typedef GpuComplexVector<NSIMD_ComplexH, half > GpuVectorCH;
typedef GpuVector<NSIMD_RealF, float > GpuVectorRF;
typedef GpuComplexVector<NSIMD_ComplexF, float> GpuVectorCF;
typedef GpuVector<NSIMD_RealD, double > GpuVectorRD;
typedef GpuComplexVector<NSIMD_ComplexD,double> GpuVectorCD;
typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
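As a worked example of these counts (taking GEN_SIMD_WIDTH = 64 bytes, an illustrative value, not a mandated one): NSIMD_RealH = 64/2 = 32, NSIMD_RealF = 64/4 = 16, NSIMD_RealD = 64/8 = 8, so a GpuVectorCF then carries 16 complex floats in 128 bytes (rrrr[16] plus iiii[16]).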
namespace Optimization {
struct Vsplat{
//Complex float
accelerator_inline GpuVectorCF operator()(float a, float b){
GpuVectorCF ret;
for(int i=0;i<GpuVectorCF::N;i++){
ret.rrrr[i] = typename GpuVectorCF::datum(a);
ret.iiii[i] = typename GpuVectorCF::datum(b);
}
return ret;
}
// Real float
accelerator_inline GpuVectorRF operator()(float a){
GpuVectorRF ret;
for(int i=0;i<GpuVectorRF::N;i++){
ret.rrrr[i] = typename GpuVectorRF::datum(a);
}
return ret;
}
//Complex double
accelerator_inline GpuVectorCD operator()(double a, double b){
GpuVectorCD ret;
for(int i=0;i<GpuVectorCD::N;i++){
ret.rrrr[i] = typename GpuVectorCD::datum(a);
ret.iiii[i] = typename GpuVectorCD::datum(b);
}
return ret;
}
//Real double
accelerator_inline GpuVectorRD operator()(double a){
GpuVectorRD ret;
for(int i=0;i<GpuVectorRD::N;i++){
ret.rrrr[i] = typename GpuVectorRD::datum(a);
}
return ret;
}
//Integer
accelerator_inline GpuVectorI operator()(Integer a){
GpuVectorI ret;
for(int i=0;i<GpuVectorI::N;i++){
ret.rrrr[i] = typename GpuVectorI::datum(a);
}
return ret;
}
};
struct Vstore{
template<int N,class datum,class P>
accelerator_inline void operator()(GpuVector<N,datum> a, P* Fp){
GpuVector<N,datum> *vF = (GpuVector<N,datum> *)Fp;
*vF = a;
}
template<int N,class datum,class P>
accelerator_inline void operator()(GpuComplexVector<N,datum> a, P* Fp){
GpuComplexVector<N,datum> *vF = (GpuComplexVector<N,datum> *)Fp;
*vF = a;
}
};
struct Vstream{
template<int N,class datum, class P>
accelerator_inline void operator()(P* F,GpuVector<N,datum> a){
GpuVector<N,datum> *vF = (GpuVector<N,datum> *)F;
*vF = a;
}
template<int N,class datum, class P>
accelerator_inline void operator()(P* F,GpuComplexVector<N,datum> a){
GpuComplexVector<N,datum> *vF = (GpuComplexVector<N,datum> *)F;
*vF = a;
}
};
struct Vset{
// Complex float
accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a){
typedef GpuVectorCF vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = vec::datum(a[i].real());
ret.iiii[i] = vec::datum(a[i].imag());
}
return ret;
}
// Complex double
accelerator_inline GpuVectorCD operator()(Grid::ComplexD *a){
typedef GpuVectorCD vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = vec::datum(a[i].real());
ret.iiii[i] = vec::datum(a[i].imag());
}
return ret;
}
// Real float
accelerator_inline GpuVectorRF operator()(float *a){
typedef GpuVectorRF vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = vec::datum(a[i]);
}
return ret;
}
// Real double
accelerator_inline GpuVectorRD operator()(double *a){
typedef GpuVectorRD vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = vec::datum(a[i]);
}
return ret;
}
// Integer
accelerator_inline GpuVectorI operator()(Integer *a){
typedef GpuVectorI vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = vec::datum(a[i]);
}
return ret;
}
};
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
accelerator_inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Real float
accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){
return a+b;
}
accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){
return a+b;
}
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
return a+b;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
return a+b;
}
accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){
return a+b;
}
};
struct Sub{
accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){
return a-b;
}
accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){
return a-b;
}
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
return a-b;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
return a-b;
}
accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){
return a-b;
}
};
struct MultRealPart{
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
typedef GpuVectorCF vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = a.rrrr[i]*b.rrrr[i];
ret.iiii[i] = a.rrrr[i]*b.iiii[i];
}
return ret;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
typedef GpuVectorCD vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = a.rrrr[i]*b.rrrr[i];
ret.iiii[i] = a.rrrr[i]*b.iiii[i];
}
return ret;
}
};
struct MaddRealPart{
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b,GpuVectorCF c){
typedef GpuVectorCF vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = a.rrrr[i]*b.rrrr[i]+c.rrrr[i];
ret.iiii[i] = a.rrrr[i]*b.iiii[i]+c.iiii[i];
}
return ret;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b,GpuVectorCD c){
typedef GpuVectorCD vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = a.rrrr[i]*b.rrrr[i]+c.rrrr[i];
ret.iiii[i] = a.rrrr[i]*b.iiii[i]+c.iiii[i];
}
return ret;
}
};
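// Worked example (illustrative, not part of the original header): only
// the real part of the first operand participates in these two functors.
// With a = 2+3i, b = 4+5i, c = 10+20i:
//   MultRealPart: Re(a)*b     = (2*4, 2*5)       =  8 + 10i
//   MaddRealPart: Re(a)*b + c = (2*4+10, 2*5+20) = 18 + 30i
// Im(a) = 3 is never read, which is what makes the RRII layout cheap here.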
struct MultComplex{
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
return a*b;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
return a*b;
}
};
struct Mult{
accelerator_inline void mac(GpuVectorRF &a, GpuVectorRF b, GpuVectorRF c){
a= a+b*c;
}
accelerator_inline void mac(GpuVectorRD &a, GpuVectorRD b, GpuVectorRD c){
a= a+b*c;
}
// Real float
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b){
return a*b;
}
// Real double
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
return a*b;
}
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){
return a*b;
}
};
struct Div{
// Real float
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b){
return a/b;
}
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
return a/b;
}
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){
return a/b;
}
// Danger -- element-wise divide for complex, not true complex division.
// See Grid_vector_types.h lines around 735, applied after "toReal"
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){
return a/b;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b){
return a/b;
}
};
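// Illustrative note (not part of the original header): the complex
// overloads above are only a building block. True complex division is
// assembled upstream (see operator/ in Grid_vector_types.h) as
//   a/b = a*conj(b) / (b*conj(b)),
// where b*conj(b) is real, duplicated into both halves of each complex
// pair via toReal, and only then fed to this element-wise divide:
//   (a/b).re = (a.re*b.re + a.im*b.im) / (b.re^2 + b.im^2)
//   (a/b).im = (a.im*b.re - a.re*b.im) / (b.re^2 + b.im^2)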
struct Conj{
// Complex single
accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
typedef GpuVectorCF vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = in.rrrr[i];
ret.iiii[i] =-in.iiii[i];
}
return ret;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
typedef GpuVectorCD vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = in.rrrr[i];
ret.iiii[i] =-in.iiii[i];
}
return ret;
}
};
struct TimesMinusI{
//Complex single
accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
typedef GpuVectorCF vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = in.iiii[i];
ret.iiii[i] =-in.rrrr[i];
}
return ret;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
typedef GpuVectorCD vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] = in.iiii[i];
ret.iiii[i] =-in.rrrr[i];
}
return ret;
}
};
struct TimesI{
//Complex single
accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
typedef GpuVectorCF vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] =-in.iiii[i];
ret.iiii[i] = in.rrrr[i];
}
return ret;
}
accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
typedef GpuVectorCD vec;
vec ret;
for(int i=0;i<vec::N;i++){
ret.rrrr[i] =-in.iiii[i];
ret.iiii[i] = in.rrrr[i];
}
return ret;
}
};
struct Permute{
template <int n,int _N, class _datum >
static accelerator_inline GpuVector<_N,_datum> PermuteN(GpuVector<_N,_datum> &in) {
typedef GpuVector<_N,_datum> vec;
vec out;
unsigned int _mask = vec::N >> (n + 1);
for(int i=0;i<vec::N;i++) {
out.rrrr[i] = in.rrrr[i^_mask];
}
return out;
}
template <int n,int _N, class _datum >
static accelerator_inline GpuComplexVector<_N,_datum> PermuteN(GpuComplexVector<_N,_datum> &in) {
typedef GpuComplexVector<_N,_datum> vec;
vec out;
unsigned int _mask = vec::N >> (n + 1);
for(int i=0;i<vec::N;i++) {
out.rrrr[i] = in.rrrr[i^_mask];
out.iiii[i] = in.iiii[i^_mask];
}
return out;
}
template <typename vec> static accelerator_inline vec Permute0(vec in) { return PermuteN<0,vec::N,typename vec::datum>(in); }
template <typename vec> static accelerator_inline vec Permute1(vec in) { return PermuteN<1,vec::N,typename vec::datum>(in); }
template <typename vec> static accelerator_inline vec Permute2(vec in) { return PermuteN<2,vec::N,typename vec::datum>(in); }
template <typename vec> static accelerator_inline vec Permute3(vec in) { return PermuteN<3,vec::N,typename vec::datum>(in); }
};
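// Illustrative table (not part of the original header): PermuteN pairs
// lane i with lane i^mask, mask = N >> (n+1). For a hypothetical N = 8:
//   Permute0: mask=4 : 0<->4 1<->5 2<->6 3<->7  (swap halves)
//   Permute1: mask=2 : 0<->2 1<->3 4<->6 5<->7  (swap pairs)
//   Permute2: mask=1 : 0<->1 2<->3 4<->5 6<->7  (swap neighbours)
// i.e. the standard butterfly network used for SIMD-direction shifts.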
struct PrecisionChange {
////////////////////////////////////////////////////////////////////////////////////
// Single / Half
////////////////////////////////////////////////////////////////////////////////////
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
int N = GpuVectorCF::N;
GpuVectorCH h;
for(int i=0;i<N;i++) {
h.rrrr[i ] = float2half(a.rrrr[i]);
h.iiii[i ] = float2half(a.iiii[i]);
h.rrrr[i+N] = float2half(b.rrrr[i]);
h.iiii[i+N] = float2half(b.iiii[i]);
}
return h;
}
static accelerator_inline void HtoS (GpuVectorCH h,GpuVectorCF &sa,GpuVectorCF &sb) {
int N = GpuVectorCF::N;
for(int i=0;i<N;i++) {
sa.rrrr[i] = half2float(h.rrrr[i ]);
sa.iiii[i] = half2float(h.iiii[i ]);
sb.rrrr[i] = half2float(h.rrrr[i+N]);
sb.iiii[i] = half2float(h.iiii[i+N]);
}
}
static accelerator_inline GpuVectorRH StoH (GpuVectorRF a,GpuVectorRF b) {
int N = GpuVectorRF::N;
GpuVectorRH h;
for(int i=0;i<N;i++) {
h.rrrr[i ] = float2half(a.rrrr[i]);
h.rrrr[i+N] = float2half(b.rrrr[i]);
}
return h;
}
static accelerator_inline void HtoS (GpuVectorRH h,GpuVectorRF &sa,GpuVectorRF &sb) {
int N = GpuVectorRF::N;
for(int i=0;i<N;i++) {
sa.rrrr[i] = half2float(h.rrrr[i ]);
sb.rrrr[i] = half2float(h.rrrr[i+N]);
}
}
////////////////////////////////////////////////////////////////////////////////////
// Double Single
////////////////////////////////////////////////////////////////////////////////////
static accelerator_inline GpuVectorCF DtoS (GpuVectorCD a,GpuVectorCD b) {
int N = GpuVectorCD::N;
GpuVectorCF h;
for(int i=0;i<N;i++) {
h.rrrr[i ] = a.rrrr[i];
h.iiii[i ] = a.iiii[i];
h.rrrr[i+N] = b.rrrr[i];
h.iiii[i+N] = b.iiii[i];
}
return h;
}
static accelerator_inline void StoD (GpuVectorCF h,GpuVectorCD &sa,GpuVectorCD &sb) {
int N = GpuVectorCD::N;
for(int i=0;i<N;i++) {
sa.rrrr[i] = h.rrrr[i ];
sa.iiii[i] = h.iiii[i ];
sb.rrrr[i] = h.rrrr[i+N];
sb.iiii[i] = h.iiii[i+N];
}
}
static accelerator_inline GpuVectorRF DtoS (GpuVectorRD a,GpuVectorRD b) {
int N = GpuVectorRD::N;
GpuVectorRF h;
for(int i=0;i<N;i++) {
h.rrrr[i ] = a.rrrr[i];
h.rrrr[i+N] = b.rrrr[i];
}
return h;
}
static accelerator_inline void StoD (GpuVectorRF h,GpuVectorRD &sa,GpuVectorRD &sb) {
int N = GpuVectorRD::N;
for(int i=0;i<N;i++) {
sa.rrrr[i] = h.rrrr[i ];
sb.rrrr[i] = h.rrrr[i+N];
}
}
////////////////////////////////////////////////////////////////////////////////////
// Double Half
////////////////////////////////////////////////////////////////////////////////////
static accelerator_inline GpuVectorCH DtoH (GpuVectorCD a,GpuVectorCD b,GpuVectorCD c,GpuVectorCD d) {
GpuVectorCF sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static accelerator_inline void HtoD (GpuVectorCH h,GpuVectorCD &a,GpuVectorCD &b,GpuVectorCD &c,GpuVectorCD &d) {
GpuVectorCF sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
static accelerator_inline GpuVectorRH DtoH (GpuVectorRD a,GpuVectorRD b,GpuVectorRD c,GpuVectorRD d) {
GpuVectorRF sa,sb;
sa = DtoS(a,b);
sb = DtoS(c,d);
return StoH(sa,sb);
}
static accelerator_inline void HtoD (GpuVectorRH h,GpuVectorRD &a,GpuVectorRD &b,GpuVectorRD &c,GpuVectorRD &d) {
GpuVectorRF sa,sb;
HtoS(h,sa,sb);
StoD(sa,a,b);
StoD(sb,c,d);
}
};
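// Illustrative sketch (not part of the original header): StoH packs two
// single-precision vectors into one half-precision vector of twice the
// lane count, and HtoS unpacks it again; DtoH/HtoD compose the two
// stages. A round trip, up to fp16 rounding and range:
//   GpuVectorRF a, b, a2, b2;
//   GpuVectorRH h = PrecisionChange::StoH(a, b); // lanes [0,N) <- a, [N,2N) <- b
//   PrecisionChange::HtoS(h, a2, b2);            // a2 ~ a, b2 ~ b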
struct Exchange{
template <int n,int _N, class _datum >
static accelerator_inline void ExchangeN(GpuVector<_N,_datum> &out1,
GpuVector<_N,_datum> &out2,
GpuVector<_N,_datum> &in1,
GpuVector<_N,_datum> &in2 )
{
typedef GpuVector<_N,_datum> vec;
unsigned int mask = vec::N >> (n + 1);
for(int i=0;i<vec::N;i++) {
int j1 = i&(~mask);
if ( (i&mask) == 0 ) { out1.rrrr[i]=in1.rrrr[j1];}
else { out1.rrrr[i]=in2.rrrr[j1];}
int j2 = i|mask;
if ( (i&mask) == 0 ) { out2.rrrr[i]=in1.rrrr[j2];}
else { out2.rrrr[i]=in2.rrrr[j2];}
}
}
template <int n,int _N, class _datum >
static accelerator_inline void ExchangeN(GpuComplexVector<_N,_datum> &out1,
GpuComplexVector<_N,_datum> &out2,
GpuComplexVector<_N,_datum> &in1,
GpuComplexVector<_N,_datum> &in2 )
{
typedef GpuComplexVector<_N,_datum> vec;
unsigned int mask = vec::N >> (n + 1);
for(int i=0;i<vec::N;i++) {
int j1 = i&(~mask);
if ( (i&mask) == 0 ) {
out1.rrrr[i]=in1.rrrr[j1];
out1.iiii[i]=in1.iiii[j1];
}
else {
out1.rrrr[i]=in2.rrrr[j1];
out1.iiii[i]=in2.iiii[j1];
}
int j2 = i|mask;
if ( (i&mask) == 0 ) {
out2.rrrr[i]=in1.rrrr[j2];
out2.iiii[i]=in1.iiii[j2];
}
else {
out2.rrrr[i]=in2.rrrr[j2];
out2.iiii[i]=in2.iiii[j2];
}
}
}
template <typename vec>
static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){
ExchangeN<0>(out1,out2,in1,in2);
};
template <typename vec>
static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){
ExchangeN<1>(out1,out2,in1,in2);
};
template <typename vec>
static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){
ExchangeN<2>(out1,out2,in1,in2);
};
template <typename vec>
static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){
ExchangeN<3>(out1,out2,in1,in2);
};
};
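// Illustrative table (not part of the original header): ExchangeN merges
// two inputs with mask = N >> (n+1); j1 = i&~mask picks the low element
// of each pair, j2 = i|mask the high one. For a hypothetical N = 4, n = 0
// (mask = 2), in1 = {a0,a1,a2,a3}, in2 = {b0,b1,b2,b3}:
//   out1 = {a0,a1,b0,b1}
//   out2 = {a2,a3,b2,b3}
// which is the pairing used when exchanging SIMD-split halo faces.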
struct Rotate{
template <int n, typename vec> static accelerator_inline vec tRotate(vec in){
return rotate(in, n);
}
template <int _N, class _datum >
static accelerator_inline GpuComplexVector<_N,_datum> rotate_template(GpuComplexVector<_N,_datum> &in, int n)
{
typedef GpuComplexVector<_N,_datum> vec;
vec out;
for(int i=0;i<vec::N;i++){
out.rrrr[i] = in.rrrr[(i + n)%vec::N];
out.iiii[i] = in.iiii[(i + n)%vec::N];
}
return out;
}
template <int _N, class _datum >
static accelerator_inline GpuVector<_N,_datum> rotate_template(GpuVector<_N,_datum> &in, int n)
{
typedef GpuVector<_N,_datum> vec;
vec out;
for(int i=0;i<vec::N;i++){
out.rrrr[i] = in.rrrr[(i + n)%vec::N];
}
return out;
}
typedef GpuVectorRH SIMD_Htype; // Half precision type
typedef GpuVectorRF SIMD_Ftype; // Single precision type
typedef GpuVectorRD SIMD_Dtype; // Double precision type
typedef GpuVectorI SIMD_Itype; // Integer type
typedef GpuVectorCH SIMD_CHtype; // Half precision complex type
typedef GpuVectorCF SIMD_CFtype; // Single precision complex type
typedef GpuVectorCD SIMD_CDtype; // Double precision complex type
static accelerator_inline GpuVectorRH rotate(GpuVectorRH in, int n){ return rotate_template(in,n);}
static accelerator_inline GpuVectorRF rotate(GpuVectorRF in, int n){ return rotate_template(in,n);}
static accelerator_inline GpuVectorRD rotate(GpuVectorRD in, int n){ return rotate_template(in,n);}
static accelerator_inline GpuVectorI rotate(GpuVectorI in, int n){ return rotate_template(in,n);}
static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n){ return rotate_template(in,n/2);} // Measure in complex not float
static accelerator_inline GpuVectorCF rotate(GpuVectorCF in, int n){ return rotate_template(in,n/2);}
static accelerator_inline GpuVectorCD rotate(GpuVectorCD in, int n){ return rotate_template(in,n/2);}
};
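// Illustrative note (not part of the original header): rotate_template is
// a left rotation by n lanes, out[i] = in[(i+n)%N]. For the complex types
// the incoming n counts real (float/double) lanes, hence the n/2 above:
// rotating a 4-complex vector by n = 2 reals moves one complex lane,
//   {c0,c1,c2,c3} -> {c1,c2,c3,c0}.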
//////////////////////////////////////////////
// Some Template specialization
//Complex float Reduce
template<>
accelerator_inline Grid::ComplexF
Reduce<Grid::ComplexF, GpuVectorCF>::operator()(GpuVectorCF in)
{
Grid::ComplexF greduce(in.rrrr[0],in.iiii[0]);
for(int i=1;i<GpuVectorCF::N;i++) {
greduce = greduce+Grid::ComplexF(in.rrrr[i],in.iiii[i]);
}
return greduce;
}
template<>
accelerator_inline Grid::ComplexD
Reduce<Grid::ComplexD, GpuVectorCD>::operator()(GpuVectorCD in)
{
Grid::ComplexD greduce(in.rrrr[0],in.iiii[0]);
for(int i=1;i<GpuVectorCD::N;i++) {
greduce = greduce+ Grid::ComplexD(in.rrrr[i],in.iiii[i]);
}
return greduce;
}
// Real
template<>
accelerator_inline Grid::RealF
Reduce<RealF, GpuVectorRF>::operator()(GpuVectorRF in)
{
RealF ret = in.rrrr[0];
for(int i=1;i<GpuVectorRF::N;i++) {
ret = ret+in.rrrr[i];
}
return ret;
}
template<>
accelerator_inline Grid::RealD
Reduce<RealD, GpuVectorRD>::operator()(GpuVectorRD in)
{
RealD ret = in.rrrr[0];
for(int i=1;i<GpuVectorRD::N;i++) {
ret = ret+in.rrrr[i];
}
return ret;
}
template<>
accelerator_inline Integer
Reduce<Integer, GpuVectorI>::operator()(GpuVectorI in)
{
Integer ret = in.rrrr[0];
for(int i=1;i<GpuVectorI::N;i++) {
ret = ret+in.rrrr[i];
}
return ret;
}
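// Illustrative usage (not part of the original header): each Reduce
// specialisation is a horizontal sum over lanes, e.g. (assuming the
// single-precision Vsplat overload defined earlier in this file)
//   GpuVectorRF v = Optimization::Vsplat()(2.0f); // every lane = 2
//   Grid::RealF s = Optimization::Reduce<Grid::RealF,GpuVectorRF>()(v);
//   // s == 2.0f * GpuVectorRF::N
// An Out_type/In_type pair with no specialisation falls through to the
// printf/exit trap defined above, catching mismatched reductions at run
// time instead of silently returning garbage.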
}// End Optimization
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
//////////////////////////////////////////////////////////////////////////////////////
typedef GpuVectorRH SIMD_Htype; // Half precision type
typedef GpuVectorRF SIMD_Ftype; // Single precision type
typedef GpuVectorRD SIMD_Dtype; // Double precision type
typedef GpuVectorI SIMD_Itype; // Integer type
typedef GpuVectorCH SIMD_CHtype; // Half precision complex type
typedef GpuVectorCF SIMD_CFtype; // Single precision complex type
typedef GpuVectorCD SIMD_CDtype; // Double precision complex type
// prefetch utilities
accelerator_inline void v_prefetch0(int size, const char *ptr){};
accelerator_inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
typedef Optimization::Vset VsetSIMD;
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}

View File

@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_HIP
 #include <hip/hip_fp16.h>
 #endif
-#ifdef GRID_SYCL
+#if !defined(GRID_CUDA) && !defined(GRID_HIP)
 namespace Grid {
   typedef struct { uint16_t x;} half;
   typedef struct { half x; half y;} half2;
@@ -486,7 +486,7 @@ namespace Optimization {
 struct TimesMinusI{
   //Complex single
-  accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){
+  accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
     typedef GpuVectorCF vec;
     vec ret;
     for(int i=0;i<vec::N;i++){
@@ -495,7 +495,7 @@
     }
     return ret;
   }
-  accelerator_inline GpuVectorCD operator()(GpuVectorCD in,GpuVectorCD dummy){
+  accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
     typedef GpuVectorCD vec;
     vec ret;
     for(int i=0;i<vec::N;i++){
@@ -508,7 +508,7 @@
 struct TimesI{
   //Complex single
-  accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){
+  accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
     typedef GpuVectorCF vec;
     vec ret;
     for(int i=0;i<vec::N;i++){
@@ -517,7 +517,7 @@
     }
     return ret;
   }
-  accelerator_inline GpuVectorCD operator()(GpuVectorCD in,GpuVectorCD dummy){
+  accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
     typedef GpuVectorCD vec;
     vec ret;
     for(int i=0;i<vec::N;i++){

View File

@@ -356,7 +356,7 @@ struct Conj{
 struct TimesMinusI{
   //Complex double
-  inline vector4double operator()(vector4double v, vector4double ret){
+  inline vector4double operator()(vector4double v){
     return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.},
                          (vector4double){0., 0., 0., 0.});
   }
@@ -367,7 +367,7 @@ struct TimesMinusI{
 struct TimesI{
   //Complex double
-  inline vector4double operator()(vector4double v, vector4double ret){
+  inline vector4double operator()(vector4double v){
     return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.},
                          (vector4double){0., 0., 0., 0.});
   }

View File

@@ -35,7 +35,7 @@ Author: neo <cossu@post.kek.jp>
 */
 // Time-stamp: <2015-06-16 23:27:54 neo>
 //----------------------------------------------------------------------
+#include <immintrin.h>
 #include <pmmintrin.h>
 
 NAMESPACE_BEGIN(Grid);
@@ -273,27 +273,25 @@ struct Conj{
 struct TimesMinusI{
   //Complex single
-  inline __m128 operator()(__m128 in, __m128 ret){
+  inline __m128 operator()(__m128 in){
     __m128 tmp =_mm_addsub_ps(_mm_setzero_ps(),in); // r,-i
     return _mm_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1));
   }
   //Complex double
-  inline __m128d operator()(__m128d in, __m128d ret){
+  inline __m128d operator()(__m128d in){
     __m128d tmp =_mm_addsub_pd(_mm_setzero_pd(),in); // r,-i
     return _mm_shuffle_pd(tmp,tmp,0x1);
   }
 };
 struct TimesI{
   //Complex single
-  inline __m128 operator()(__m128 in, __m128 ret){
+  inline __m128 operator()(__m128 in){
     __m128 tmp =_mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
     return _mm_addsub_ps(_mm_setzero_ps(),tmp); // r,-i
   }
   //Complex double
-  inline __m128d operator()(__m128d in, __m128d ret){
+  inline __m128d operator()(__m128d in){
     __m128d tmp = _mm_shuffle_pd(in,in,0x1);
     return _mm_addsub_pd(_mm_setzero_pd(),tmp); // r,-i
   }
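Across the GPU, QPX and SSE backends the diffs above make the same interface change: TimesMinusI and TimesI drop their unused second (dummy/result) argument and become genuinely one-argument functors. The Grid_vector_types.h diff below changes the call sites to match; schematically, under that reading:

  // before: ret.v = binary<V>(in.v, ret.v, TimesISIMD()); // dummy second arg
  // after:  ret.v = unary<V>(in.v, TimesISIMD());         // truly unary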

View File

@@ -110,11 +110,10 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
 #ifdef GPU_VEC
 #include "Grid_gpu_vec.h"
 #endif
-/*
-#ifdef GEN
-#include "Grid_generic.h"
-#endif
-*/
+#ifdef GPU_RRII
+#include "Grid_gpu_rrii.h"
+#endif
 
 #ifdef GEN
 #if defined(A64FX) || defined(A64FXFIXEDSIZE) // breakout A64FX SVE ACLE here
@@ -131,7 +130,6 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
 #include "Grid_a64fx-fixedsize.h"
 #endif
 #else
-//#pragma message("building GEN") // generic
 #include "Grid_generic.h"
 #endif
 #endif
@@ -150,23 +148,6 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
 #endif
 #endif
-/*
-#ifdef A64FXVLA
-#pragma message("building A64FX VLA")
-#if defined(ARMCLANGCOMPAT)
-#pragma message("applying data types patch")
-#endif
-#include <arm_sve.h>
-#include "Grid_a64fx-2.h"
-#endif
-
-#ifdef A64FXVLS
-#pragma message("building A64FX VLS")
-#include <arm_sve.h>
-#include "Grid_a64fx-fixedsize.h"
-#endif
-*/
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
@@ -270,12 +251,14 @@ public:
 typedef Vector_type vector_type;
 typedef Scalar_type scalar_type;
 
+/*
 typedef union conv_t_union {
   Vector_type v;
   Scalar_type s[sizeof(Vector_type) / sizeof(Scalar_type)];
   accelerator_inline conv_t_union(){};
 } conv_t;
+*/
 
 Vector_type v;
 
 static accelerator_inline constexpr int Nsimd(void) {
@@ -555,15 +538,13 @@ public:
 template <class functor>
 friend accelerator_inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
   Grid_simd ret;
-  Grid_simd::conv_t conv;
   Grid_simd::scalar_type s;
-  conv.v = v.v;
   for (int i = 0; i < Nsimd(); i++) {
-    s = conv.s[i];
-    conv.s[i] = func(s);
+    s = v.getlane(i);
+    s = func(s);
+    ret.putlane(s,i);
   }
-  ret.v = conv.v;
   return ret;
 }
 template <class functor>
@@ -571,18 +552,14 @@
                 const Grid_simd &x,
                 const Grid_simd &y) {
   Grid_simd ret;
-  Grid_simd::conv_t cx;
-  Grid_simd::conv_t cy;
   Grid_simd::scalar_type sx,sy;
-  cx.v = x.v;
-  cy.v = y.v;
   for (int i = 0; i < Nsimd(); i++) {
-    sx = cx.s[i];
-    sy = cy.s[i];
-    cx.s[i] = func(sx,sy);
+    sx = x.getlane(i);
+    sy = y.getlane(i);
+    sx = func(sx,sy);
+    ret.putlane(sx,i);
   }
-  ret.v = cx.v;
   return ret;
 }
 ///////////////////////
@@ -645,15 +622,36 @@ public:
 ///////////////////////////////
 // Getting single lanes
 ///////////////////////////////
-accelerator_inline Scalar_type getlane(int lane) {
+#ifdef GPU_RRII
+template <class S = Scalar_type,IfComplex<S> = 0>
+accelerator_inline Scalar_type getlane(int lane) const {
+  return Scalar_type(v.rrrr[lane],v.iiii[lane]);
+}
+template <class S = Scalar_type,IfComplex<S> = 0>
+accelerator_inline void putlane(const Scalar_type &_S, int lane){
+  v.rrrr[lane] = real(_S);
+  v.iiii[lane] = imag(_S);
+}
+template <class S = Scalar_type,IfNotComplex<S> = 0>
+accelerator_inline Scalar_type getlane(int lane) const {
+  return ((S*)&v)[lane];
+}
+template <class S = Scalar_type,IfNotComplex<S> = 0>
+accelerator_inline void putlane(const S &_S, int lane){
+  ((Scalar_type*)&v)[lane] = _S;
+}
+#else // Can pun to an array of complex
+accelerator_inline Scalar_type getlane(int lane) const {
   return ((Scalar_type*)&v)[lane];
 }
 accelerator_inline void putlane(const Scalar_type &S, int lane){
   ((Scalar_type*)&v)[lane] = S;
 }
+#endif
 }; // end of Grid_simd class definition
 
 ///////////////////////////////
 // Define available types
 ///////////////////////////////
@@ -663,7 +661,7 @@ typedef Grid_simd<double , SIMD_Dtype> vRealD;
 typedef Grid_simd<Integer, SIMD_Itype> vInteger;
 typedef Grid_simd<uint16_t,SIMD_Htype> vRealH;
-#ifdef GPU_VEC
+#if defined(GPU_VEC) || defined(GPU_RRII)
 typedef Grid_simd<complex<uint16_t>, SIMD_CHtype> vComplexH;
 typedef Grid_simd<complex<float>  , SIMD_CFtype> vComplexF;
 typedef Grid_simd<complex<double> , SIMD_CDtype> vComplexD;
@@ -763,6 +761,7 @@ accelerator_inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>,
 }
 //////////////////////////
+
 ///////////////////////////////////////////////
 // Initialise to 1,0,i for the correct types
 ///////////////////////////////////////////////
@@ -907,34 +906,6 @@ accelerator_inline Grid_simd<S, V> fxmac(Grid_simd<S, V> a, Grid_simd<S, V> b, G
 // ----------------------------------------------
 
-// Distinguish between complex types and others
-template <class S, class V, IfComplex<S> = 0>
-accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  typedef Grid_simd<S, V> simd;
-  simd ret;
-  simd den;
-  typename simd::conv_t conv;
-  ret = a * conjugate(b) ;
-  den = b * conjugate(b) ;
-  // duplicates real part
-  auto real_den = toReal(den);
-  simd zden;
-  memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden));
-  ret.v=binary<V>(ret.v, zden.v, DivSIMD());
-  return ret;
-};
-
-// Real/Integer types
-template <class S, class V, IfNotComplex<S> = 0>
-accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  Grid_simd<S, V> ret;
-  ret.v = binary<V>(a.v, b.v, DivSIMD());
-  return ret;
-};
 ///////////////////////
 // Conjugate
 ///////////////////////
@@ -959,30 +930,29 @@ accelerator_inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
 ///////////////////////
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
-  ret.v = binary<V>(in.v, ret.v, TimesMinusISIMD());
+  ret.v = unary<V>(in.v, TimesMinusISIMD());
 }
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
   Grid_simd<S, V> ret;
-  timesMinusI(ret, in);
+  ret.v=unary<V>(in.v, TimesMinusISIMD());
   return ret;
 }
 template <class S, class V, IfNotComplex<S> = 0>
 accelerator_inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
   return in;
 }
 ///////////////////////
 // timesI
 ///////////////////////
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
-  ret.v = binary<V>(in.v, ret.v, TimesISIMD());
+  ret.v = unary<V>(in.v, TimesISIMD());
 }
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
   Grid_simd<S, V> ret;
-  timesI(ret, in);
+  ret.v= unary<V>(in.v, TimesISIMD());
   return ret;
 }
 template <class S, class V, IfNotComplex<S> = 0>
@@ -990,6 +960,35 @@ accelerator_inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
   return in;
 }
 
+// Distinguish between complex types and others
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  typedef Grid_simd<S, V> simd;
+  simd ret;
+  simd den;
+  ret = a * conjugate(b) ;
+  den = b * conjugate(b) ;
+  // duplicates real part
+  auto real_den = toReal(den);
+  simd zden;
+  memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden));
+  ret.v=binary<V>(ret.v, zden.v, DivSIMD());
+  return ret;
+};
+
+// Real/Integer types
+template <class S, class V, IfNotComplex<S> = 0>
+accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary<V>(a.v, b.v, DivSIMD());
+  return ret;
+};
+
 /////////////////////
 // Inner, outer
 /////////////////////
@@ -1021,12 +1020,12 @@ template <class Csimd>  // must be a real arg
 accelerator_inline typename toRealMapper<Csimd>::Realified toReal(const Csimd &in) {
   typedef typename toRealMapper<Csimd>::Realified Rsimd;
   Rsimd ret;
-  typename Rsimd::conv_t conv;
-  memcpy((void *)&conv.v,(void *)&in.v,sizeof(conv.v));
+  int j=0;
   for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
-    conv.s[i + 1] = conv.s[i];  // duplicate (r,r);(r,r);(r,r); etc...
+    auto s = real(in.getlane(j++));
+    ret.putlane(s,i);
+    ret.putlane(s,i+1);
   }
-  memcpy((void *)&ret.v,(void *)&conv.v,sizeof(ret.v));
   return ret;
 }
@@ -1039,23 +1038,24 @@ template <class Rsimd>  // must be a real arg
 accelerator_inline typename toComplexMapper<Rsimd>::Complexified toComplex(const Rsimd &in) {
   typedef typename toComplexMapper<Rsimd>::Complexified Csimd;
-  typename Rsimd::conv_t conv;  // address as real
-  conv.v = in.v;
+  typedef typename Csimd::scalar_type scalar_type;
+  int j=0;
+  Csimd ret;
   for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
-    assert(conv.s[i + 1] == conv.s[i]);
+    auto rr = in.getlane(i);
+    auto ri = in.getlane(i+1);
+    assert(rr==ri);
     // trap any cases where real was not duplicated
     // indicating the SIMD grids of real and imag assignment did not correctly
     // match
-    conv.s[i + 1] = 0.0;  // zero imaginary parts
+    scalar_type s(rr,0.0);
+    ret.putlane(s,j++);
   }
-  Csimd ret;
-  memcpy((void *)&ret.v,(void *)&conv.v,sizeof(ret.v));
   return ret;
 }
 
-accelerator_inline void precisionChange(vRealF *out,vRealD *in,int nvec)
+accelerator_inline void precisionChange(vRealF *out,const vRealD *in,int nvec)
 {
   assert((nvec&0x1)==0);
   for(int m=0;m*2<nvec;m++){
@@ -1063,7 +1063,7 @@ accelerator_inline void precisionChange(vRealF *out,vRealD *in,int nvec)
     out[m].v=Optimization::PrecisionChange::DtoS(in[n].v,in[n+1].v);
   }
 }
-accelerator_inline void precisionChange(vRealH *out,vRealD *in,int nvec)
+accelerator_inline void precisionChange(vRealH *out,const vRealD *in,int nvec)
 {
   assert((nvec&0x3)==0);
   for(int m=0;m*4<nvec;m++){
@@ -1071,7 +1071,7 @@ accelerator_inline void precisionChange(vRealH *out,vRealD *in,int nvec)
     out[m].v=Optimization::PrecisionChange::DtoH(in[n].v,in[n+1].v,in[n+2].v,in[n+3].v);
   }
 }
-accelerator_inline void precisionChange(vRealH *out,vRealF *in,int nvec)
+accelerator_inline void precisionChange(vRealH *out,const vRealF *in,int nvec)
 {
   assert((nvec&0x1)==0);
   for(int m=0;m*2<nvec;m++){
@@ -1079,7 +1079,7 @@ accelerator_inline void precisionChange(vRealH *out,vRealF *in,int nvec)
     out[m].v=Optimization::PrecisionChange::StoH(in[n].v,in[n+1].v);
   }
 }
-accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec)
+accelerator_inline void precisionChange(vRealD *out,const vRealF *in,int nvec)
 {
   assert((nvec&0x1)==0);
   for(int m=0;m*2<nvec;m++){
@@ -1095,7 +1095,7 @@ accelerator_inline void precisionChange(vRealD *out,vRealF *in,int nvec)
     //      |     ~~~~~~~^
   }
 }
-accelerator_inline void precisionChange(vRealD *out,vRealH *in,int nvec)
+accelerator_inline void precisionChange(vRealD *out,const vRealH *in,int nvec)
 {
   assert((nvec&0x3)==0);
   for(int m=0;m*4<nvec;m++){
@@ -1103,7 +1103,7 @@ accelerator_inline void precisionChange(vRealD *out,vRealH *in,int nvec)
     Optimization::PrecisionChange::HtoD(in[m].v,out[n].v,out[n+1].v,out[n+2].v,out[n+3].v);
   }
 }
-accelerator_inline void precisionChange(vRealF *out,vRealH *in,int nvec)
+accelerator_inline void precisionChange(vRealF *out,const vRealH *in,int nvec)
 {
   assert((nvec&0x1)==0);
   for(int m=0;m*2<nvec;m++){
@@ -1111,12 +1111,12 @@ accelerator_inline void precisionChange(vRealF *out,vRealH *in,int nvec)
     Optimization::PrecisionChange::HtoS(in[m].v,out[n].v,out[n+1].v);
   }
 }
-accelerator_inline void precisionChange(vComplexF *out,vComplexD *in,int nvec){ precisionChange((vRealF *)out,(vRealD *)in,nvec);}
-accelerator_inline void precisionChange(vComplexH *out,vComplexD *in,int nvec){ precisionChange((vRealH *)out,(vRealD *)in,nvec);}
-accelerator_inline void precisionChange(vComplexH *out,vComplexF *in,int nvec){ precisionChange((vRealH *)out,(vRealF *)in,nvec);}
-accelerator_inline void precisionChange(vComplexD *out,vComplexF *in,int nvec){ precisionChange((vRealD *)out,(vRealF *)in,nvec);}
-accelerator_inline void precisionChange(vComplexD *out,vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
-accelerator_inline void precisionChange(vComplexF *out,vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
+accelerator_inline void precisionChange(vComplexF *out,const vComplexD *in,int nvec){ precisionChange((vRealF *)out,(vRealD *)in,nvec);}
+accelerator_inline void precisionChange(vComplexH *out,const vComplexD *in,int nvec){ precisionChange((vRealH *)out,(vRealD *)in,nvec);}
+accelerator_inline void precisionChange(vComplexH *out,const vComplexF *in,int nvec){ precisionChange((vRealH *)out,(vRealF *)in,nvec);}
+accelerator_inline void precisionChange(vComplexD *out,const vComplexF *in,int nvec){ precisionChange((vRealD *)out,(vRealF *)in,nvec);}
+accelerator_inline void precisionChange(vComplexD *out,const vComplexH *in,int nvec){ precisionChange((vRealD *)out,(vRealH *)in,nvec);}
+accelerator_inline void precisionChange(vComplexF *out,const vComplexH *in,int nvec){ precisionChange((vRealF *)out,(vRealH *)in,nvec);}
 
 // Check our vector types are of an appropriate size.
@@ -1130,21 +1130,6 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 #endif
 #endif
 
-/////////////////////////////////////////
-// Some traits to recognise the types
-/////////////////////////////////////////
-template <typename T>
-struct is_simd : public std::false_type {};
-template <> struct is_simd<vRealF> : public std::true_type {};
-template <> struct is_simd<vRealD> : public std::true_type {};
-template <> struct is_simd<vRealH> : public std::true_type {};
-template <> struct is_simd<vComplexF> : public std::true_type {};
-template <> struct is_simd<vComplexD> : public std::true_type {};
-template <> struct is_simd<vComplexH> : public std::true_type {};
-template <> struct is_simd<vInteger> : public std::true_type {};
-
-template <typename T> using IfSimd = Invoke<std::enable_if<is_simd<T>::value, int> >;
-template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;
 NAMESPACE_END(Grid);

View File

@@ -29,8 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#ifndef GRID_VECTOR_UNOPS
-#define GRID_VECTOR_UNOPS
+#pragma once
 
 #include <cmath>
@@ -112,6 +111,9 @@ template <class scalar>
 struct ImagFunctor {
   accelerator scalar operator()(const scalar &a) const { return imag(a); }
 };
+/////////////
+// Unary operations
+/////////////
 template <class S, class V>
 accelerator_inline Grid_simd<S, V> real(const Grid_simd<S, V> &r) {
   return SimdApply(RealFunctor<S>(), r);
@@ -168,6 +170,65 @@ template <class S, class V>
 accelerator_inline Grid_simd<S, V> div(const Grid_simd<S, V> &r, Integer y) {
   return SimdApply(DivIntFunctor<S>(y), r);
 }
+/// Double 2 cases
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> real(const Grid_simd2<S, V> &r) {
+  return SimdApply(RealFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> imag(const Grid_simd2<S, V> &r) {
+  return SimdApply(ImagFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> sqrt(const Grid_simd2<S, V> &r) {
+  return SimdApply(SqrtRealFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> cos(const Grid_simd2<S, V> &r) {
+  return SimdApply(CosRealFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> sin(const Grid_simd2<S, V> &r) {
+  return SimdApply(SinRealFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> acos(const Grid_simd2<S, V> &r) {
+  return SimdApply(AcosRealFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> asin(const Grid_simd2<S, V> &r) {
+  return SimdApply(AsinRealFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> log(const Grid_simd2<S, V> &r) {
+  return SimdApply(LogRealFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> abs(const Grid_simd2<S, V> &r) {
+  return SimdApply(AbsRealFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> exp(const Grid_simd2<S, V> &r) {
+  return SimdApply(ExpFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> Not(const Grid_simd2<S, V> &r) {
+  return SimdApply(NotFunctor<S>(), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> pow(const Grid_simd2<S, V> &r, double y) {
+  return SimdApply(PowRealFunctor<S>(y), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> mod(const Grid_simd2<S, V> &r, Integer y) {
+  return SimdApply(ModIntFunctor<S>(y), r);
+}
+template <class S, class V>
+accelerator_inline Grid_simd2<S, V> div(const Grid_simd2<S, V> &r, Integer y) {
+  return SimdApply(DivIntFunctor<S>(y), r);
+}
 ////////////////////////////////////////////////////////////////////////////
 // Allows us to assign into **conformable** real vectors from complex
 ////////////////////////////////////////////////////////////////////////////
@@ -193,23 +254,22 @@ struct OrOrFunctor {
 ////////////////////////////////
 template <class S, class V>
 accelerator_inline Grid_simd<S, V> operator&(const Grid_simd<S, V> &x,
                                              const Grid_simd<S, V> &y) {
   return SimdApplyBinop(AndFunctor<S>(), x, y);
 }
 template <class S, class V>
 accelerator_inline Grid_simd<S, V> operator&&(const Grid_simd<S, V> &x,
                                               const Grid_simd<S, V> &y) {
   return SimdApplyBinop(AndAndFunctor<S>(), x, y);
 }
 template <class S, class V>
 accelerator_inline Grid_simd<S, V> operator|(const Grid_simd<S, V> &x,
                                              const Grid_simd<S, V> &y) {
   return SimdApplyBinop(OrFunctor<S>(), x, y);
 }
 template <class S, class V>
 accelerator_inline Grid_simd<S, V> operator||(const Grid_simd<S, V> &x,
                                               const Grid_simd<S, V> &y) {
   return SimdApplyBinop(OrOrFunctor<S>(), x, y);
 }
 NAMESPACE_END(Grid);
-#endif

View File

@@ -69,6 +69,7 @@ typedef RealF Real;
 typedef thrust::complex<RealF> ComplexF;
 typedef thrust::complex<RealD> ComplexD;
 typedef thrust::complex<Real>  Complex;
+typedef thrust::complex<uint16_t> ComplexH;
 template<class T> using complex = thrust::complex<T>;
 
 accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(thrust::pow(r,(double)y)); }
@@ -77,6 +78,7 @@ accelerator_inline ComplexF pow(const ComplexF& r,RealF y){ return(thrust::pow(r
 typedef std::complex<RealF> ComplexF;
 typedef std::complex<RealD> ComplexD;
 typedef std::complex<Real>  Complex;
+typedef std::complex<uint16_t> ComplexH; // Hack
 template<class T> using complex = std::complex<T>;
 
 accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(std::pow(r,y)); }
@@ -224,18 +226,14 @@ accelerator_inline void Gpermute(VectorSIMD &y,const VectorSIMD &b,int perm);
 NAMESPACE_END(Grid);
 
 #include <Grid/simd/Grid_vector_types.h>
+#include <Grid/simd/Grid_doubled_vector.h>
 #include <Grid/simd/Grid_vector_unops.h>
 
 NAMESPACE_BEGIN(Grid);
-// Default precision
-#ifdef GRID_DEFAULT_PRECISION_DOUBLE
+// Default precision is wired to double
 typedef vRealD vReal;
 typedef vComplexD vComplex;
-#else
-typedef vRealF vReal;
-typedef vComplexF vComplex;
-#endif
 
 inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){
   int nn=vComplexF::Nsimd();
@@ -262,6 +260,13 @@ inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
   stream<<">";
   return stream;
 }
+inline std::ostream& operator<< (std::ostream& stream, const vComplexD2 &o){
+  stream<<"<";
+  stream<<o.v[0];
+  stream<<o.v[1];
+  stream<<">";
+  return stream;
+}
 inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
   int nn=vRealF::Nsimd();

View File

@@ -3,26 +3,108 @@
 NAMESPACE_BEGIN(Grid);
 
-template<class vobj>
-accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type)
-{
-  typedef decltype(coalescedRead(mp0)) sobj;
-  unsigned int Nsimd = vobj::Nsimd();
-  unsigned int mask = Nsimd >> (type + 1);
-  int lane = acceleratorSIMTlane(Nsimd);
-  int j0 = lane &(~mask); // inner coor zero
-  int j1 = lane |(mask) ; // inner coor one
-  const vobj *vpa = &vp0;
-  const vobj *vpb = &vp1;
-  const vobj *vp = (lane&mask) ? (vpb) : (vpa);
-  auto sa = coalescedRead(vp[0],j0);
-  auto sb = coalescedRead(vp[0],j1);
-  coalescedWrite(mp0,sa);
-  coalescedWrite(mp1,sb);
-}
+class SimpleStencilParams{
+public:
+  Coordinate dirichlet;
+  int partialDirichlet;
+  SimpleStencilParams() { partialDirichlet = 0; };
+};
+
+// Compressors will inherit buffer management policies
+// Standard comms buffer management
+class FaceGatherSimple
+{
+public:
+  static int PartialCompressionFactor(GridBase *grid) {return 1;};
+  // Decompress is after merge so ok
+  template<class vobj,class cobj,class compressor>
+  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
+                                   const Lattice<vobj> &rhs,
+                                   cobj *buffer,
+                                   compressor &compress,
+                                   int off,int so,int partial)
+  {
+    int num=table.size();
+    std::pair<int,int> *table_v = & table[0];
+    auto rhs_v = rhs.View(AcceleratorRead);
+    accelerator_forNB( i,num, vobj::Nsimd(), {
+      compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
+    });
+    rhs_v.ViewClose();
+  }
+  template<class vobj,class cobj,class compressor>
+  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+                                    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
+                                    compressor &compress,int type,int partial)
+  {
+    assert( (table.size()&0x1)==0);
+    int num=table.size()/2;
+    int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
+    auto rhs_v = rhs.View(AcceleratorRead);
+    auto p0=&pointers[0][0];
+    auto p1=&pointers[1][0];
+    auto tp=&table[0];
+    auto rhs_p = &rhs_v[0];
+    accelerator_forNB(j, num, vobj::Nsimd(), {
+      compress.CompressExchange(p0[j],p1[j],
+                                rhs_p[so+tp[2*j  ].second],
+                                rhs_p[so+tp[2*j+1].second],
+                                type);
+    });
+    rhs_v.ViewClose();
+  }
+  template<class decompressor,class Decompression>
+  static void DecompressFace(decompressor decompress,Decompression &dd)
+  {
+    auto kp = dd.kernel_p;
+    auto mp = dd.mpi_p;
+    accelerator_forNB(o,dd.buffer_size,1,{
+      decompress.Decompress(kp[o],mp[o]);
+    });
+  }
+  template<class decompressor,class Merger>
+  static void MergeFace(decompressor decompress,Merger &mm)
+  {
+    auto mp = &mm.mpointer[0];
+    auto vp0= &mm.vpointers[0][0];
+    auto vp1= &mm.vpointers[1][0];
+    auto type= mm.type;
+    accelerator_forNB(o,mm.buffer_size/2,Merger::Nsimd,{
+      decompress.Exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+    });
+  }
+};
+
+////////////////////////////////////
+// Wilson compressor will add alternate policies for Dirichlet
+// and possibly partial Dirichlet for DWF
+////////////////////////////////////
+/*
+class FaceGatherDirichlet
+{
+  // If it's dirichlet we don't assemble comms buffers
+  //
+  // Rely on zeroes in gauge field to drive the correct result
+  // NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute
+  template<class vobj,class cobj,class compressor>
+  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so){};
+  template<class vobj,class cobj,class compressor>
+  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+                                    Vector<cobj *> pointers,int dimension,int plane,int cbmask,
+                                    compressor &compress,int type) {}
+  template<class decompressor,class Merger>
+  static void Merge(decompressor decompress,Merge &mm) { }
+  template<class decompressor,class Decompression>
+  static void Decompress(decompressor decompress,Decompression &dd) {}
+};
+*/
 
-template<class vobj>
-class SimpleCompressor {
+template<class vobj,class FaceGather>
+class SimpleCompressorGather : public FaceGather {
 public:
@@ -30,20 +112,19 @@ public:
   accelerator_inline void Compress(vobj &buf,const vobj &in) const {
     coalescedWrite(buf,coalescedRead(in));
   }
-  accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o) const {
+  accelerator_inline void Exchange(vobj &mp0,vobj &mp1,vobj &vp0,vobj &vp1,Integer type) const {
 #ifdef GRID_SIMT
-    exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+    exchangeSIMT(mp0,mp1,vp0,vp1,type);
 #else
-    exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+    exchange(mp0,mp1,vp0,vp1,type);
 #endif
   }
-  accelerator_inline void Decompress(vobj *out,vobj *in, int o) const { assert(0); }
-  accelerator_inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in,
-                                           int j,int k, int m,int type) const {
+  accelerator_inline void Decompress(vobj &out,vobj &in) const {  };
+  accelerator_inline void CompressExchange(vobj &out0,vobj &out1,const vobj &in0,const vobj &in1,int type) const {
#ifdef GRID_SIMT
-    exchangeSIMT(out0[j],out1[j],in[k],in[m],type);
+    exchangeSIMT(out0,out1,in0,in1,type);
 #else
-    exchange(out0[j],out1[j],in[k],in[m],type);
+    exchange(out0,out1,in0,in1,type);
 #endif
   }
   // For cshift. Cshift should drop compressor coupling altogether
@@ -52,11 +133,18 @@ public:
     return arg;
   }
 };
-class SimpleStencilParams{
-public:
-  Coordinate dirichlet;
-  SimpleStencilParams() {};
-};
+
+// Standard compressor never needs dirichlet.
+//
+// Get away with a local period wrap and rely on dirac operator to use a zero gauge link as it is faster
+//
+// Compressors that inherit Dirichlet and Non-dirichlet behaviour.
+//
+// Currently run-time behaviour through StencilParameters paramaters, p.dirichlet
+// combined with the FaceGatherSimple behaviour
+template <class vobj> using SimpleCompressor = SimpleCompressorGather<vobj,FaceGatherSimple>;
+//template <class vobj> using SimpleCompressorDirichlet = SimpleCompressorGather<vobj,FaceGatherDirichlet>;
 
 NAMESPACE_END(Grid);
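The compressor refactor above is a compile-time policy mix-in: the FaceGather base class supplies the buffer-assembly routines (Gather_plane_simple, MergeFace, ...) while SimpleCompressorGather supplies the per-site Compress/Exchange arithmetic. A minimal standalone model of the pattern, with hypothetical names (GatherPolicy, Compressor, Simple):

#include <cstdio>
// Buffer-management policy: how faces would be gathered.
struct GatherPolicy {
  static int PartialCompressionFactor() { return 1; }
};
// Arithmetic layer, parameterised by the gather policy it inherits.
template<class T, class FaceGather>
struct Compressor : public FaceGather {
  T Compress(const T &in) const { return in; } // identity, as in SimpleCompressorGather
};
template<class T> using Simple = Compressor<T, GatherPolicy>;
int main() {
  Simple<double> c;
  std::printf("%d %f\n", Simple<double>::PartialCompressionFactor(), c.Compress(3.14));
}

A Dirichlet variant would then swap in a different gather policy (the commented-out FaceGatherDirichlet above) without touching the arithmetic layer.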

View File

@@ -29,6 +29,27 @@
 
 NAMESPACE_BEGIN(Grid);
 
+uint64_t DslashFullCount;
+uint64_t DslashPartialCount;
+uint64_t DslashDirichletCount;
+
+void DslashResetCounts(void)
+{
+  DslashFullCount=0;
+  DslashPartialCount=0;
+  DslashDirichletCount=0;
+}
+void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
+{
+  dirichlet = DslashDirichletCount;
+  partial   = DslashPartialCount;
+  full      = DslashFullCount;
+}
+void DslashLogFull(void)     { DslashFullCount++;}
+void DslashLogPartial(void)  { DslashPartialCount++;}
+void DslashLogDirichlet(void){ DslashDirichletCount++;}
+
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                  int off,std::vector<std::pair<int,int> > & table)
 {
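The counters introduced above are plain process-global tallies, incremented by the DslashLog* hooks and read back through DslashGetCounts. A usage sketch, assuming only the functions declared in this diff (reportDslashMix is a hypothetical helper):

#include <cstdint>
#include <iostream>
void reportDslashMix() {
  uint64_t dirichlet, partial, full;
  DslashGetCounts(dirichlet, partial, full); // snapshot the three tallies
  std::cout << "Dslash: " << full << " full, " << partial
            << " partial, " << dirichlet << " Dirichlet" << std::endl;
  DslashResetCounts();                       // open a fresh accounting window
}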

View File

@ -52,6 +52,16 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
DefaultImplParams() {
dirichlet.resize(0);
partialDirichlet=0;
};
};
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split with compression // Gather for when there *is* need to SIMD split with compression
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@ -59,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table); int off,std::vector<std::pair<int,int> > & table);
/*
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline));
@ -80,11 +91,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor> template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(const Lattice<vobj> &rhs, void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline)); commVector<cobj *> pointers,
int dimension,int plane,
int cbmask,compressor &compress,int type) __attribute__((noinline));
template<class cobj,class vobj,class compressor> template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
Vector<cobj *> pointers,int dimension,int plane,int cbmask, const Lattice<vobj> &rhs,
std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
compressor &compress,int type) compressor &compress,int type)
{ {
assert( (table.size()&0x1)==0); assert( (table.size()&0x1)==0);
@ -92,17 +106,25 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead); auto rhs_v = rhs.View(AcceleratorRead);
auto rhs_p = &rhs_v[0];
auto p0=&pointers[0][0]; auto p0=&pointers[0][0];
auto p1=&pointers[1][0]; auto p1=&pointers[1][0];
auto tp=&table[0]; auto tp=&table[0];
accelerator_forNB(j, num, vobj::Nsimd(), { accelerator_forNB(j, num, vobj::Nsimd(), {
compress.CompressExchange(p0,p1, &rhs_v[0], j, compress.CompressExchange(p0,p1, rhs_p, j,
so+tp[2*j ].second, so+tp[2*j ].second,
so+tp[2*j+1].second, so+tp[2*j+1].second,
type); type);
}); });
rhs_v.ViewClose(); rhs_v.ViewClose();
} }
*/
void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
void DslashLogPartial(void);
void DslashLogDirichlet(void);
 struct StencilEntry {
 #ifdef GRID_CUDA
@@ -133,8 +155,18 @@ class CartesianStencilAccelerator {
   int _osites;
   StencilVector _directions;
   StencilVector _distances;
-  StencilVector _comms_send;
-  StencilVector _comms_recv;
+  ///////////////////////////////////////////////////
+  // If true, this is FULLY communicated per face
+  // Otherwise will either be full or partial dirichlet
+  ///////////////////////////////////////////////////
+  StencilVector _comms_send;
+  StencilVector _comms_recv; // this is FULLY communicated per face
+  ///////////////////////////////////////////////////
+  // If true, this is partially communicated per face
+  ///////////////////////////////////////////////////
+  StencilVector _comms_partial_send;
+  StencilVector _comms_partial_recv;
+  //
   StencilVector _comm_buf_size;
   StencilVector _permute_type;
   StencilVector same_node;
@@ -181,7 +213,7 @@ class CartesianStencilAccelerator {
 template<class vobj,class cobj,class Parameters>
 class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters>
 {
-private:
+public:
   int *closed;
   StencilEntry *cpu_ptr;
   ViewMode mode;
@@ -216,7 +248,6 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters
 public:
   typedef typename cobj::vector_type vector_type;
-  typedef typename cobj::scalar_type scalar_type;
   typedef typename cobj::scalar_object scalar_object;
   typedef const CartesianStencilView<vobj,cobj,Parameters> View_type;
   typedef typename View_type::StencilVector StencilVector;
@@ -230,19 +261,26 @@ public:
     Integer from_rank;
     Integer do_send;
     Integer do_recv;
-    Integer bytes;
+    Integer xbytes;
+    Integer rbytes;
   };
   struct Merge {
+    static constexpr int Nsimd = vobj::Nsimd();
     cobj * mpointer;
-    Vector<scalar_object *> rpointers;
-    Vector<cobj *> vpointers;
+    //    std::vector<scalar_object *> rpointers;
+    std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer type;
+    Integer partial; // partial dirichlet BCs
+    Coordinate dims;
   };
   struct Decompress {
+    static constexpr int Nsimd = vobj::Nsimd();
     cobj * kernel_p;
     cobj * mpi_p;
     Integer buffer_size;
+    Integer partial; // partial dirichlet BCs
+    Coordinate dims;
   };
   struct CopyReceiveBuffer {
     void * from_p;
@@ -253,7 +291,8 @@ public:
     Integer direction;
     Integer OrthogPlane;
     Integer DestProc;
-    Integer bytes;
+    Integer xbytes;
+    Integer rbytes;
     Integer lane;
     Integer cb;
     void *recv_buf;
@@ -261,9 +300,9 @@ public:
 protected:
   GridBase * _grid;
 public:
   GridBase *Grid(void) const { return _grid; }
-  LebesgueOrder *lo;
   ////////////////////////////////////////////////////////////////////////
   // Needed to conveniently communicate gparity parameters into GPU memory
@@ -278,6 +317,8 @@ public:
   }
   int face_table_computed;
+  int partialDirichlet;
+  int fullDirichlet;
   std::vector<commVector<std::pair<int,int> > > face_table ;
   Vector<int> surface_list;
@@ -307,6 +348,7 @@ public:
   ////////////////////////////////////////
   // Stencil query
   ////////////////////////////////////////
+#ifdef SHM_FAST_PATH
   inline int SameNode(int point) {
     int dimension    = this->_directions[point];
@@ -326,7 +368,40 @@ public:
     if ( displacement == 0 ) return 1;
     return 0;
   }
+#else
+  // fancy calculation for shm code
+  inline int SameNode(int point) {
+    int dimension    = this->_directions[point];
+    int displacement = this->_distances[point];
+    int pd = _grid->_processors[dimension];
+    int fd = _grid->_fdimensions[dimension];
+    int ld = _grid->_ldimensions[dimension];
+    int rd = _grid->_rdimensions[dimension];
+    int simd_layout = _grid->_simd_layout[dimension];
+    int comm_dim    = _grid->_processors[dimension] > 1 ;
+    int recv_from_rank;
+    int xmit_to_rank;
+    if ( ! comm_dim ) return 1;
+    int nbr_proc;
+    if (displacement > 0) nbr_proc = 1;
+    else                  nbr_proc = pd-1;
+    // FIXME this logic needs to be sorted for three link term
+    //    assert( (displacement==1) || (displacement==-1));
+    // Present hack only works for >= 4^4 subvol per node
+    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+    void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p);
+    if ( shm==NULL ) return 0;
+    return 1;
+  }
+#endif
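The non-SHM_FAST_PATH SameNode above decides whether a stencil leg can stay on-node. A toy illustration of the neighbour-rank arithmetic it performs (values are examples only):

    #include <cstdio>
    int main() {
      int pd = 4;                       // ranks in this dimension (example)
      int disps[2] = {+1, -1};
      for (int k = 0; k < 2; k++) {
        int displacement = disps[k];
        int nbr_proc = (displacement > 0) ? 1 : pd - 1;  // periodic wrap
        printf("displacement %+d probes shifted rank %d\n", displacement, nbr_proc);
      }
      // SameNode() then returns 1 only when ShmBufferTranslate() can map the
      // neighbour's receive buffer, i.e. the neighbour shares this node.
    }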
   //////////////////////////////////////////
   // Comms packet queue for asynch thread
   // Use OpenMP Tasks for cleaner ???
@@ -359,20 +434,25 @@ public:
   ////////////////////////////////////////////////////////////////////////
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
+    accelerator_barrier();
     for(int i=0;i<Packets.size();i++){
       _grid->StencilSendToRecvFromBegin(MpiReqs,
					Packets[i].send_buf,
					Packets[i].to_rank,Packets[i].do_send,
					Packets[i].recv_buf,
					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].bytes,i);
+					Packets[i].xbytes,Packets[i].rbytes,i);
     }
   }
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     _grid->StencilSendToRecvFromComplete(MpiReqs,0);
+    if      ( this->partialDirichlet ) DslashLogPartial();
+    else if ( this->fullDirichlet    ) DslashLogDirichlet();
+    else                               DslashLogFull();
+    acceleratorCopySynchronise();
+    // Everyone agrees we are all done
+    _grid->StencilBarrier();
   }
   ////////////////////////////////////////////////////////////////////////
   // Blocking send and receive. Either sequential or parallel.
@@ -450,7 +530,6 @@ public:
   {
     _grid->StencilBarrier();// Synch shared memory on a single nodes
-    //    conformable(source.Grid(),_grid);
     assert(source.Grid()==_grid);
     u_comm_offset=0;
@@ -502,7 +581,9 @@ public:
     }
   }
-  Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,Integer bytes,Integer cb)
+  Integer CheckForDuplicate(Integer direction, Integer OrthogPlane, Integer DestProc, void *recv_buf,Integer lane,
+			    Integer xbytes,Integer rbytes,
+			    Integer cb)
   {
     CachedTransfer obj;
     obj.direction  = direction;
@@ -510,19 +591,22 @@ public:
     obj.DestProc   = DestProc;
     obj.recv_buf   = recv_buf;
     obj.lane       = lane;
-    obj.bytes      = bytes;
+    obj.xbytes     = xbytes;
+    obj.rbytes     = rbytes;
    obj.cb         = cb;
    for(int i=0;i<CachedTransfers.size();i++){
      if ( (CachedTransfers[i].direction  ==direction)
	  &&(CachedTransfers[i].OrthogPlane==OrthogPlane)
	  &&(CachedTransfers[i].DestProc   ==DestProc)
-	  &&(CachedTransfers[i].bytes      ==bytes)
+	  &&(CachedTransfers[i].xbytes     ==xbytes)
+	  &&(CachedTransfers[i].rbytes     ==rbytes)
	  &&(CachedTransfers[i].lane       ==lane)
	  &&(CachedTransfers[i].cb         ==cb)
	  ){
-	AddCopy(CachedTransfers[i].recv_buf,recv_buf,bytes);
+	// FIXME worry about duplicate with partial compression
+	// Wont happen as DWF has no duplicates, but...
+	AddCopy(CachedTransfers[i].recv_buf,recv_buf,rbytes);
	return 1;
      }
    }
@@ -533,7 +617,7 @@ public:
  void AddPacket(void *xmit,void * rcv,
		 Integer to,   Integer do_send,
		 Integer from, Integer do_recv,
-		 Integer bytes){
+		 Integer xbytes,Integer rbytes){
    Packet p;
    p.send_buf = xmit;
    p.recv_buf = rcv;
@@ -541,18 +625,25 @@ public:
    p.from_rank= from;
    p.do_send  = do_send;
    p.do_recv  = do_recv;
-    p.bytes    = bytes;
+    p.xbytes   = xbytes;
+    p.rbytes   = rbytes;
+    //    if (do_send) std::cout << GridLogMessage << " MPI packet to   "<<to  << " of size "<<xbytes<<std::endl;
+    //    if (do_recv) std::cout << GridLogMessage << " MPI packet from "<<from<< " of size "<<xbytes<<std::endl;
    Packets.push_back(p);
  }
  void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
    Decompress d;
+    d.partial  = this->partialDirichlet;
+    d.dims     = _grid->_fdimensions;
    d.kernel_p = k_p;
    d.mpi_p    = m_p;
    d.buffer_size = buffer_size;
    dv.push_back(d);
  }
-  void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
+  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
    Merge m;
+    m.partial  = this->partialDirichlet;
+    m.dims     = _grid->_fdimensions;
    m.type     = type;
    m.mpointer = merge_p;
    m.vpointers= rpointers;
@@ -564,30 +655,21 @@ public:
    CommsMerge(decompress,Mergers,Decompressions);
  }
  template<class decompressor> void CommsMergeSHM(decompressor decompress) {
-    _grid->StencilBarrier();// Synch shared memory on a single nodes
-    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+    assert(MergersSHM.size()==0);
+    assert(DecompressionsSHM.size()==0);
  }
  template<class decompressor>
  void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd)
  {
    for(int i=0;i<mm.size();i++){
-      auto mp = &mm[i].mpointer[0];
-      auto vp0= &mm[i].vpointers[0][0];
-      auto vp1= &mm[i].vpointers[1][0];
-      auto type= mm[i].type;
-      accelerator_forNB(o,mm[i].buffer_size/2,vobj::Nsimd(),{
-	  decompress.Exchange(mp,vp0,vp1,type,o);
-      });
+      decompressor::MergeFace(decompress,mm[i]);
    }
+    if ( mm.size() ) acceleratorFenceComputeStream();
    for(int i=0;i<dd.size();i++){
-      auto kp = dd[i].kernel_p;
-      auto mp = dd[i].mpi_p;
-      accelerator_forNB(o,dd[i].buffer_size,1,{
-	  decompress.Decompress(kp,mp,o);
-      });
+      decompressor::DecompressFace(decompress,dd[i]);
    }
+    if ( dd.size() ) acceleratorFenceComputeStream();
  }
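CommsMerge now dispatches through static MergeFace/DecompressFace hooks on the decompressor type instead of launching the exchange/decompress loops inline, so a partial-Dirichlet compressor can supply its own face kernels. A minimal sketch of the interface shape this assumes (a toy stand-in, not Grid's compressor):

    // Toy decompressor showing the static per-face hooks CommsMerge calls.
    struct ToyDecompressor {
      template<class Merge>
      static void MergeFace(ToyDecompressor &d, Merge &m) {
        // reassemble SIMD lanes from m.vpointers into m.mpointer,
        // honouring m.partial for partial-Dirichlet faces
      }
      template<class Decompress>
      static void DecompressFace(ToyDecompressor &d, Decompress &dd) {
        for (int o = 0; o < dd.buffer_size; o++)
          dd.kernel_p[o] = dd.mpi_p[o];   // copy MPI buffer into kernel order
      }
    };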
   ////////////////////////////////////////
   // Set up routines
@@ -645,6 +727,8 @@ public:
       int block = dirichlet_block[dimension];
       this->_comms_send[ii] = comm_dim;
       this->_comms_recv[ii] = comm_dim;
+      this->_comms_partial_send[ii] = 0;
+      this->_comms_partial_recv[ii] = 0;
       if ( block && comm_dim ) {
	assert(abs(displacement) < ld );
	// Quiesce communication across block boundaries
@@ -665,6 +749,10 @@ public:
	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
	}
+	if ( partialDirichlet ) {
+	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
+	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+	}
       }
     }
   }
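Worked example of the quiescing rule above, with illustrative numbers: take ld = 8 local sites per rank and dirichlet_block = 16, so every second rank boundary coincides with a domain wall.

    int ld = 8, block = 16;
    for (int pc = 0; pc < 4; pc++) {
      bool send_off = ((ld*(pc+1)) % block) == 0; // upper face on a wall
      bool recv_off = ((ld*pc    ) % block) == 0; // lower face on a wall
      // pc=0: recv off;  pc=1: send off;  pc=2: recv off;  pc=3: send off
    }
    // With partialDirichlet set, each quiesced face is exchanged in
    // compressed (partial) form instead of being dropped entirely.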
@@ -673,7 +761,7 @@ public:
		   int checkerboard,
		   const std::vector<int> &directions,
		   const std::vector<int> &distances,
-		   Parameters p)
+		   Parameters p=Parameters())
   {
     face_table_computed=0;
     _grid = grid;
@@ -692,8 +780,12 @@ public:
     this->same_node.resize(npoints);
     if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
+    partialDirichlet = p.partialDirichlet;
     DirichletBlock(p.dirichlet); // comms send/recv set up
+    fullDirichlet=0;
+    for(int d=0;d<p.dirichlet.size();d++){
+      if (p.dirichlet[d]) fullDirichlet=1;
+    }
     _unified_buffer_size=0;
     surface_list.resize(0);
@@ -828,7 +920,7 @@ public:
     GridBase *grid=_grid;
     const int Nsimd = grid->Nsimd();
-    int comms_recv = this->_comms_recv[point];
+    int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
     int fd = _grid->_fdimensions[dimension];
     int ld = _grid->_ldimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1014,11 +1106,12 @@ public:
   int Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point)
   {
     typedef typename cobj::vector_type vector_type;
-    typedef typename cobj::scalar_type scalar_type;
-    int comms_send = this->_comms_send[point] ;
-    int comms_recv = this->_comms_recv[point] ;
+    int comms_send = this->_comms_send[point];
+    int comms_recv = this->_comms_recv[point];
+    int comms_partial_send = this->_comms_partial_send[point] ;
+    int comms_partial_recv = this->_comms_partial_recv[point] ;
     assert(rhs.Grid()==_grid);
     //  conformable(_grid,rhs.Grid());
@@ -1048,7 +1141,17 @@ public:
     if (cbmask != 0x3) words=words>>1;
     int bytes = words * compress.CommDatumSize();
+    int xbytes;
+    int rbytes;
+    if      ( comms_send )         xbytes = bytes; // Full send
+    else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+    else                           xbytes = 0;     // full dirichlet
+
+    if      ( comms_recv )         rbytes = bytes;
+    else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+    else                           rbytes = 0;
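Illustrative sizing of the wire traffic chosen above (the face size and the compression factor are assumed numbers, not taken from this diff):

    // Face of 1024 words at 96 bytes per datum; assume
    // compressor::PartialCompressionFactor(_grid) == 4 for this grid.
    int bytes = 1024 * 96;           // full face: 98304 bytes
    // comms_send         -> xbytes = 98304 (full)
    // comms_partial_send -> xbytes = 24576 (bytes/4 on the wire)
    // neither            -> xbytes = 0     (fully Dirichlet face)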
     int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
     int comm_off = u_comm_offset;
@@ -1061,49 +1164,72 @@ public:
     assert (xmit_to_rank   != _grid->ThisRank());
     assert (recv_from_rank != _grid->ThisRank());
-    if( comms_send ) {
-      if ( !face_table_computed ) {
-	face_table.resize(face_idx+1);
-	std::vector<std::pair<int,int> > face_table_host ;
-	Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
-	face_table[face_idx].resize(face_table_host.size());
-	acceleratorCopyToDevice(&face_table_host[0],
-				&face_table[face_idx][0],
-				face_table[face_idx].size()*sizeof(face_table_host[0]));
-      }
-      if ( compress.DecompressionStep() ) {
-	recv_buf=u_simd_recv_buf[0];
-      } else {
-	recv_buf=this->u_recv_buf_p;
-      }
-      send_buf = this->u_send_buf_p; // Gather locally, must send
-      ////////////////////////////////////////////////////////
-      // Gather locally
-      ////////////////////////////////////////////////////////
-      assert(send_buf!=NULL);
-      Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,comm_off,so);
-    }
-    int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,bytes,cbmask);
-    if ( (!duplicate) ) { // Force comms for now
+    if ( !face_table_computed ) {
+      face_table.resize(face_idx+1);
+      std::vector<std::pair<int,int> > face_table_host ;
+      Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
+      //      std::cout << "bytes expect "<< bytes << " " << face_table_host.size()* compress.CommDatumSize()<<std::endl;
+      face_table[face_idx].resize(face_table_host.size());
+      acceleratorCopyToDevice(&face_table_host[0],
			      &face_table[face_idx][0],
			      face_table[face_idx].size()*sizeof(face_table_host[0]));
+    }
+    if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+      recv_buf=u_simd_recv_buf[0];
+    } else {
+      recv_buf=this->u_recv_buf_p;
+    }
+    // potential SHM fast path for intranode
+    int shm_send=0;
+    int shm_recv=0;
+#ifdef SHM_FAST_PATH
+    // Put directly in place if we can
+    send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf);
+    if ( (send_buf==NULL) ) {
+      shm_send=0;
+      send_buf = this->u_send_buf_p;
+    } else {
+      shm_send=1;
+    }
+    void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf);
+    if ( test_ptr != NULL ) shm_recv = 1;
+    //    static int printed;
+    //    if (!printed){
+    //      std::cout << " GATHER FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+    //      printed = 1;
+    //    }
+#else
+    ////////////////////////////////////////////////////////
+    // Gather locally
+    ////////////////////////////////////////////////////////
+    send_buf = this->u_send_buf_p; // Gather locally, must send
+    assert(send_buf!=NULL);
+#endif
+    //    std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
+    compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+
+    int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
+    if ( !duplicate ) { // Force comms for now
       ///////////////////////////////////////////////////////////
       // Build a list of things to do after we synchronise GPUs
       // Start comms now???
       ///////////////////////////////////////////////////////////
+      int do_send = (comms_send|comms_partial_send) && (!shm_send );
+      int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
       AddPacket((void *)&send_buf[comm_off],
		(void *)&recv_buf[comm_off],
-		xmit_to_rank, comms_send,
-		recv_from_rank, comms_recv,
-		bytes);
+		xmit_to_rank, do_send,
+		recv_from_rank, do_recv,
+		xbytes,rbytes);
     }
-    if ( compress.DecompressionStep() && comms_recv ) {
+    if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
       AddDecompress(&this->u_recv_buf_p[comm_off],
		    &recv_buf[comm_off],
		    words,Decompressions);
@@ -1111,7 +1237,6 @@ public:
       u_comm_offset+=words;
       face_idx++;
     }
-    }
     return 0;
@@ -1124,8 +1249,10 @@ public:
     const int maxl =2;// max layout in a direction
-    int comms_send = this->_comms_send[point] ;
-    int comms_recv = this->_comms_recv[point] ;
+    int comms_send = this->_comms_send[point];
+    int comms_recv = this->_comms_recv[point];
+    int comms_partial_send = this->_comms_partial_send[point] ;
+    int comms_partial_recv = this->_comms_partial_recv[point] ;
     int fd = _grid->_fdimensions[dimension];
     int rd = _grid->_rdimensions[dimension];
@@ -1155,10 +1282,15 @@ public:
     int datum_bytes = compress.CommDatumSize();
     int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
+    // how many bytes on wire : partial dirichlet or dirichlet may set to < bytes
+    int xbytes;
+    int rbytes;
     assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
-    Vector<cobj *> rpointers(maxl);
-    Vector<cobj *> spointers(maxl);
+    std::vector<cobj *> rpointers(maxl);
+    std::vector<cobj *> spointers(maxl);
     ///////////////////////////////////////////
     // Work out what to send where
@@ -1184,22 +1316,37 @@ public:
     if ( !face_table_computed ) {
       face_table.resize(face_idx+1);
       std::vector<std::pair<int,int> > face_table_host ;
       Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,comm_off,face_table_host);
       face_table[face_idx].resize(face_table_host.size());
       acceleratorCopyToDevice(&face_table_host[0],
			      &face_table[face_idx][0],
			      face_table[face_idx].size()*sizeof(face_table_host[0]));
     }
-    if ( comms_send || comms_recv ) {
-      Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);
+    if ( comms_send ) xbytes = bytes;
+    else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+    else xbytes = 0;
+
+    if ( comms_recv ) rbytes = bytes;
+    else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+    else rbytes = 0;
+
+    // Gathers SIMD lanes for send and merge
+    // Different faces can be full comms or partial comms with multiple ranks per node
+    if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+      int partial = partialDirichlet;
+      compressor::Gather_plane_exchange(face_table[face_idx],rhs,
					spointers,dimension,sx,cbmask,
					compress,permute_type,partial );
     }
     face_idx++;
-    //spointers[0] -- low
-    //spointers[1] -- high
+    //spointers[0] -- low simd coor
+    //spointers[1] -- high simd coor
     for(int i=0;i<maxl;i++){
       int my_coor = rd*i + x; // self explanatory
@@ -1220,17 +1367,48 @@ public:
       int recv_from_rank;
       int xmit_to_rank;
+      int shm_send=0;
+      int shm_recv=0;
       _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+#ifdef SHM_FAST_PATH
+#warning STENCIL SHM FAST PATH SELECTED
+      // shm == receive pointer         if offnode
+      // shm == Translate[send pointer] if on node -- my view of his send pointer
+      cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+      if (shm==NULL) {
+	shm = rp;
+	// we found a packet that comes from MPI and contributes to this shift.
+	// is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil.
+	// Kernel will add the exterior_terms except if is_same_node.
+	// leg of stencil
+	shm_recv=0;
+      } else {
+	shm_recv=1;
+      }
+      rpointers[i] = shm;
+      // Test send side
+      void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp);
+      if ( test_ptr != NULL ) shm_send = 1;
+      //      static int printed;
+      //      if (!printed){
+      //	std::cout << " GATHERSIMD FAST PATH SHM "<<shm_send<< " "<<shm_recv<<std::endl;
+      //	printed = 1;
+      //      }
+#else
       rpointers[i] = rp;
+#endif
-      int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,bytes,cbmask);
+      int duplicate = CheckForDuplicate(dimension,sx,nbr_proc,(void *)rp,i,xbytes,rbytes,cbmask);
       if ( !duplicate ) {
+	if ( (bytes != rbytes) && (rbytes!=0) ){
+	  acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
+	}
+	int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
	AddPacket((void *)sp,(void *)rp,
-		  xmit_to_rank,comms_send,
-		  recv_from_rank,comms_recv,
-		  bytes);
+		  xmit_to_rank,do_send,
+		  recv_from_rank,do_send,
+		  xbytes,rbytes);
       }
     } else {
@@ -1239,8 +1417,8 @@ public:
       }
     }
+    // rpointer may be doing a remote read in the gather over SHM
-    if ( comms_recv ) {
+    if ( comms_recv|comms_partial_recv ) {
       AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
     }
==== next file ====
@@ -31,6 +31,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
+////////////////////////////////////////////////
+// Inside a GPU thread
+////////////////////////////////////////////////
+template<class vobj>
+accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type)
+{
+  typedef decltype(coalescedRead(mp0)) sobj;
+  unsigned int Nsimd = vobj::Nsimd();
+  unsigned int mask = Nsimd >> (type + 1);
+  int lane = acceleratorSIMTlane(Nsimd);
+  int j0 = lane &(~mask); // inner coor zero
+  int j1 = lane |(mask) ; // inner coor one
+  const vobj *vpa = &vp0;
+  const vobj *vpb = &vp1;
+  const vobj *vp = (lane&mask) ? (vpb) : (vpa);
+  auto sa = coalescedRead(vp[0],j0);
+  auto sb = coalescedRead(vp[0],j1);
+  coalescedWrite(mp0,sa);
+  coalescedWrite(mp1,sb);
+}
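A host-side illustration of the lane arithmetic in exchangeSIMT, for Nsimd = 8 and type = 0 (mask = 4), so lanes 0-3 pair with lanes 4-7:

    #include <cstdio>
    int main() {
      const unsigned Nsimd = 8, type = 0;
      const unsigned mask = Nsimd >> (type + 1);   // 4
      for (unsigned lane = 0; lane < Nsimd; lane++) {
        unsigned j0 = lane & (~mask);              // partner, mask bit clear
        unsigned j1 = lane |   mask;               // partner, mask bit set
        printf("lane %u reads %s lanes (%u,%u)\n",
               lane, (lane & mask) ? "vp1" : "vp0", j0, j1);
      }
    }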
 #ifndef GRID_SIMT
 //////////////////////////////////////////
==== next file ====
@@ -178,6 +178,7 @@ public:
     stream << "S {" << o._internal << "}";
     return stream;
   };
+  // FIXME These will break with change of data layout
   strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(&_internal); }
   strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(&_internal); }
   strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
@@ -288,6 +289,7 @@ public:
   //      return _internal[i];
   //    }
+  // FIXME These will break with change of data layout
   strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(_internal); }
   strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(_internal); }
   strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
@@ -430,6 +432,7 @@ public:
   //      return _internal[i][j];
   //    }
+  // FIXME These will break with change of data layout
   strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(_internal[0]); }
   strong_inline       scalar_type * begin()       { return reinterpret_cast<      scalar_type *>(_internal[0]); }
   strong_inline const scalar_type * end()   const { return begin() + Traits::count; }
==== next file ====
@@ -1,5 +1,5 @@
 /*************************************************************************************
-n
 Grid physics library, www.github.com/paboyle/Grid

 Source file: ./lib/tensors/Tensor_extract_merge.h
@@ -62,8 +62,18 @@ void extract(const vobj &vec,ExtractBuffer<sobj> &extracted)
   const int words=sizeof(vobj)/sizeof(vector_type);
   const int Nsimd=vector_type::Nsimd();
   const int Nextr=extracted.size();
+  vector_type * vp = (vector_type *)&vec;
   const int s=Nsimd/Nextr;
   sobj_scalar_type *sp = (sobj_scalar_type *) &extracted[0];
+  sobj_scalar_type stmp;
+  for(int w=0;w<words;w++){
+    for(int i=0;i<Nextr;i++){
+      stmp = vp[w].getlane(i*s);
+      sp[i*words+w] = stmp;
+      //      memcpy((char *)&sp[i*words+w],(char *)&stmp,sizeof(stmp));
+    }
+  }
+  /*
   scalar_type *vp = (scalar_type *)&vec;
   scalar_type vtmp;
   sobj_scalar_type stmp;
@@ -74,6 +84,8 @@ void extract(const vobj &vec,ExtractBuffer<sobj> &extracted)
       memcpy((char *)&sp[i*words+w],(char *)&stmp,sizeof(stmp));
     }
   }
+  */
   return;
 }
@@ -93,7 +105,7 @@ void merge(vobj &vec,ExtractBuffer<sobj> &extracted)
   const int s=Nsimd/Nextr;
   sobj_scalar_type *sp = (sobj_scalar_type *)&extracted[0];
-  scalar_type *vp = (scalar_type *)&vec;
+  vector_type *vp = (vector_type *)&vec;
   scalar_type vtmp;
   sobj_scalar_type stmp;
   for(int w=0;w<words;w++){
@@ -101,7 +113,8 @@ void merge(vobj &vec,ExtractBuffer<sobj> &extracted)
       for(int ii=0;ii<s;ii++){
	memcpy((char *)&stmp,(char *)&sp[i*words+w],sizeof(stmp));
	vtmp = stmp;
-	memcpy((char *)&vp[w*Nsimd+i*s+ii],(char *)&vtmp,sizeof(vtmp));
+	vp[w].putlane(vtmp,i*s+ii);
+	//	memcpy((char *)&vp[w*Nsimd+i*s+ii],(char *)&vtmp,sizeof(vtmp));
       }
     }
   }
@@ -117,16 +130,16 @@ typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec
   typedef typename vobj::scalar_object scalar_object;
   typedef typename vobj::vector_type vector_type;
   typedef typename ExtractTypeMap<scalar_type>::extract_type extract_type;
-  typedef extract_type * pointer;
+  typedef scalar_type * pointer;
   constexpr int words=sizeof(vobj)/sizeof(vector_type);
   constexpr int Nsimd=vector_type::Nsimd();
   scalar_object extracted;
   pointer __restrict__ sp = (pointer)&extracted; // Type pun
-  pointer __restrict__ vp = (pointer)&vec;
+  vector_type *vp = (vector_type *)&vec;
   for(int w=0;w<words;w++){
-    sp[w]=vp[w*Nsimd+lane];
+    sp[w]=vp[w].getlane(lane);
   }
   return extracted;
 }
@@ -137,15 +150,15 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob
   typedef typename vobj::vector_type vector_type;
   typedef typename vector_type::scalar_type scalar_type;
   typedef typename ExtractTypeMap<scalar_type>::extract_type extract_type;
-  typedef extract_type * pointer;
+  typedef scalar_type * pointer;
   constexpr int words=sizeof(vobj)/sizeof(vector_type);
   constexpr int Nsimd=vector_type::Nsimd();
   pointer __restrict__ sp = (pointer)&extracted;
-  pointer __restrict__ vp = (pointer)&vec;
+  vector_type *vp = (vector_type *)&vec;
   for(int w=0;w<words;w++){
-    vp[w*Nsimd+lane]=sp[w];
+    vp[w].putlane(sp[w],lane);
   }
 }
@@ -164,15 +177,13 @@ void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int off
   const int Nextr=extracted.size();
   const int s = Nsimd/Nextr;
-  scalar_type * vp = (scalar_type *)&vec;
+  vector_type * vp = (vector_type *)&vec;
   scalar_type vtmp;
   sobj_scalar_type stmp;
   for(int w=0;w<words;w++){
     for(int i=0;i<Nextr;i++){
       sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
-      memcpy((char *)&vtmp,(char *)&vp[w*Nsimd+i*s],sizeof(vtmp));
-      stmp = vtmp;
-      memcpy((char *)&pointer[w],(char *)&stmp,sizeof(stmp));
+      pointer[w] = vp[w].getlane(i*s); // may do a precision conversion
     }
   }
 }
@@ -192,23 +203,21 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
   const int Nextr=extracted.size();
   const int s = Nsimd/Nextr;
-  scalar_type * vp = (scalar_type *)&vec;
+  vector_type * vp = (vector_type *)&vec;
   scalar_type vtmp;
   sobj_scalar_type stmp;
   for(int w=0;w<words;w++){
     for(int i=0;i<Nextr;i++){
       sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
       for(int ii=0;ii<s;ii++){
-	memcpy((char *)&stmp,(char *)&pointer[w],sizeof(stmp));
-	vtmp=stmp;
-	memcpy((char *)&vp[w*Nsimd+i*s+ii],(char *)&vtmp,sizeof(vtmp));
+	vtmp=pointer[w];
+	vp[w].putlane(vtmp,i*s+ii);
       }
     }
   }
 }
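The rewrite above swaps pointer type-punning for per-word getlane/putlane calls on the vector type, which keeps working when the SIMD storage layout is not a plain scalar array. A toy stand-in for the contract those methods satisfy:

    #include <array>
    #include <cstdio>
    struct ToyVec4 {                      // stand-in for a Grid SIMD word
      std::array<double,4> lanes;
      double getlane(int lane) const     { return lanes[lane]; }
      void   putlane(double s, int lane) { lanes[lane] = s; }
    };
    int main() {
      ToyVec4 v{{0.0, 1.0, 2.0, 3.0}};
      double s = v.getlane(2);            // extract lane 2 -> 2.0
      v.putlane(9.0, 2);                  // insert 9.0 into lane 2
      printf("%g %g\n", s, v.getlane(2)); // prints: 2 9
    }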
 //////////////////////////////////////////////////////////////////////////////////
 //Copy a single lane of a SIMD tensor type from one object to another
 //Output object must be of the same tensor type but may be of a different precision (i.e. it can have a different root data type)
@@ -217,7 +226,7 @@ template<class vobjOut, class vobjIn>
 accelerator_inline
 void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
 {
-  static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
+  static_assert( std::is_same<typename vobjOut::scalar_typeD, typename vobjIn::scalar_typeD>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
   typedef typename vobjOut::vector_type ovector_type;
   typedef typename vobjIn::vector_type ivector_type;
@@ -239,12 +248,12 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
   iscalar_type itmp;
   oscalar_type otmp;
-  opointer __restrict__ op = (opointer)&vecOut;
-  ipointer __restrict__ ip = (ipointer)&vecIn;
+  ovector_type * __restrict__ op = (ovector_type *)&vecOut;
+  ivector_type * __restrict__ ip = (ivector_type *)&vecIn;
   for(int w=0;w<owords;w++){
-    memcpy( (char*)&itmp, (char*)(ip + lane_in + iNsimd*w), sizeof(iscalar_type) );
+    itmp = ip[w].getlane(lane_in);
     otmp = itmp; //potential precision change
-    memcpy( (char*)(op + lane_out + oNsimd*w), (char*)&otmp, sizeof(oscalar_type) );
+    op[w].putlane(otmp,lane_out);
   }
 }
==== next file ====
@@ -214,24 +214,20 @@ accelerator_inline vRealD innerProductD2(const vRealD &l,const vRealD &
 accelerator_inline vComplexD2 innerProductD2(const vComplexF &l,const vComplexF &r)
 {
-  vComplexD la,lb;
-  vComplexD ra,rb;
-  Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
-  Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
+  vComplexD2 dl,dr;
   vComplexD2 ret;
-  ret._internal[0] = innerProduct(la,ra);
-  ret._internal[1] = innerProduct(lb,rb);
+  precisionChange(dl,l);
+  precisionChange(dr,r);
+  ret = innerProduct(dl,dr);
   return ret;
 }
 accelerator_inline vRealD2 innerProductD2(const vRealF &l,const vRealF &r)
 {
-  vRealD la,lb;
-  vRealD ra,rb;
-  Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
-  Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
+  vRealD2 dl,dr;
   vRealD2 ret;
-  ret._internal[0]=innerProduct(la,ra);
-  ret._internal[1]=innerProduct(lb,rb);
+  precisionChange(dl,l);
+  precisionChange(dr,r);
+  ret=innerProduct(dl,dr);
   return ret;
 }
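The lane bookkeeping behind this rewrite: a single-precision SIMD word with Nf lanes promotes to two double-precision words of Nf/2 lanes each, which vRealD2/vComplexD2 bundle into one logical type so innerProduct can act on the full lane set. An illustration with an assumed 512-bit register width:

    #include <cstdio>
    int main() {
      const int bits = 512;          // register width (assumption)
      const int Nf = bits / 32;      // 16 float lanes
      const int Nd = bits / 64;      // 8 double lanes
      printf("%d float lanes -> %d double words of %d lanes\n",
             Nf, Nf / Nd, Nd);       // 16 -> 2 words of 8
    }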
==== next file ====
@@ -42,39 +42,6 @@ NAMESPACE_BEGIN(Grid);
   template<typename T> struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; };
   template<class T>    struct isGridScalar<iScalar<T>> : public std::true_type { static constexpr bool notvalue = false; };
-  // Store double-precision data in single-precision grids for precision promoted localInnerProductD
-  template<typename T>
-  class TypePair {
-  public:
-    T _internal[2];
-    accelerator TypePair<T>& operator=(const Grid::Zero& o) {
-      _internal[0] = Zero();
-      _internal[1] = Zero();
-      return *this;
-    }
-    accelerator TypePair<T> operator+(const TypePair<T>& o) const {
-      TypePair<T> r;
-      r._internal[0] = _internal[0] + o._internal[0];
-      r._internal[1] = _internal[1] + o._internal[1];
-      return r;
-    }
-    accelerator TypePair<T>& operator+=(const TypePair<T>& o) {
-      _internal[0] += o._internal[0];
-      _internal[1] += o._internal[1];
-      return *this;
-    }
-    friend accelerator_inline void add(TypePair<T>* ret, const TypePair<T>* a, const TypePair<T>* b) {
-      add(&ret->_internal[0],&a->_internal[0],&b->_internal[0]);
-      add(&ret->_internal[1],&a->_internal[1],&b->_internal[1]);
-    }
-  };
-  typedef TypePair<ComplexD>  ComplexD2;
-  typedef TypePair<RealD>     RealD2;
-  typedef TypePair<vComplexD> vComplexD2;
-  typedef TypePair<vRealD>    vRealD2;
   // Traits to identify fundamental data types
   template<typename T> struct isGridFundamental : public std::false_type { static constexpr bool notvalue = true; };
@@ -88,8 +55,6 @@ NAMESPACE_BEGIN(Grid);
   template<> struct isGridFundamental<RealD> : public std::true_type { static constexpr bool notvalue = false; };
   template<> struct isGridFundamental<vComplexD2> : public std::true_type { static constexpr bool notvalue = false; };
   template<> struct isGridFundamental<vRealD2> : public std::true_type { static constexpr bool notvalue = false; };
-  template<> struct isGridFundamental<ComplexD2> : public std::true_type { static constexpr bool notvalue = false; };
-  template<> struct isGridFundamental<RealD2> : public std::true_type { static constexpr bool notvalue = false; };
   //////////////////////////////////////////////////////////////////////////////////
@@ -136,7 +101,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexF Complexified;
     typedef RealF Realified;
     typedef RealD DoublePrecision;
-    typedef RealD2 DoublePrecision2;
+    typedef RealD DoublePrecision2;
   };
   template<> struct GridTypeMapper<RealD> : public GridTypeMapper_Base {
     typedef RealD scalar_type;
@@ -151,19 +116,6 @@ NAMESPACE_BEGIN(Grid);
     typedef RealD DoublePrecision;
     typedef RealD DoublePrecision2;
   };
-  template<> struct GridTypeMapper<RealD2> : public GridTypeMapper_Base {
-    typedef RealD2 scalar_type;
-    typedef RealD2 scalar_typeD;
-    typedef RealD2 vector_type;
-    typedef RealD2 vector_typeD;
-    typedef RealD2 tensor_reduced;
-    typedef RealD2 scalar_object;
-    typedef RealD2 scalar_objectD;
-    typedef ComplexD2 Complexified;
-    typedef RealD2 Realified;
-    typedef RealD2 DoublePrecision;
-    typedef RealD2 DoublePrecision2;
-  };
   template<> struct GridTypeMapper<ComplexF> : public GridTypeMapper_Base {
     typedef ComplexF scalar_type;
     typedef ComplexD scalar_typeD;
@@ -175,7 +127,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexF Complexified;
     typedef RealF Realified;
     typedef ComplexD DoublePrecision;
-    typedef ComplexD2 DoublePrecision2;
+    typedef ComplexD DoublePrecision2;
   };
   template<> struct GridTypeMapper<ComplexD> : public GridTypeMapper_Base {
     typedef ComplexD scalar_type;
@@ -191,7 +143,7 @@ NAMESPACE_BEGIN(Grid);
     typedef ComplexD DoublePrecision2;
   };
-#ifdef GRID_CUDA
+#if defined(GRID_CUDA) || defined(GRID_HIP)
   template<> struct GridTypeMapper<std::complex<float> > : public GridTypeMapper_Base {
     typedef std::complex<float> scalar_type;
     typedef std::complex<double> scalar_typeD;
@@ -220,19 +172,6 @@ NAMESPACE_BEGIN(Grid);
   };
 #endif
-  template<> struct GridTypeMapper<ComplexD2> : public GridTypeMapper_Base {
-    typedef ComplexD2 scalar_type;
-    typedef ComplexD2 scalar_typeD;
-    typedef ComplexD2 vector_type;
-    typedef ComplexD2 vector_typeD;
-    typedef ComplexD2 tensor_reduced;
-    typedef ComplexD2 scalar_object;
-    typedef ComplexD2 scalar_objectD;
-    typedef ComplexD2 Complexified;
-    typedef RealD2 Realified;
-    typedef ComplexD2 DoublePrecision;
-    typedef ComplexD2 DoublePrecision2;
-  };
   template<> struct GridTypeMapper<Integer> : public GridTypeMapper_Base {
     typedef Integer scalar_type;
     typedef Integer scalar_typeD;
@@ -274,13 +213,13 @@ NAMESPACE_BEGIN(Grid);
     typedef vRealD DoublePrecision2;
   };
   template<> struct GridTypeMapper<vRealD2> : public GridTypeMapper_Base {
-    typedef RealD2 scalar_type;
-    typedef RealD2 scalar_typeD;
+    typedef RealD scalar_type;
+    typedef RealD scalar_typeD;
     typedef vRealD2 vector_type;
     typedef vRealD2 vector_typeD;
     typedef vRealD2 tensor_reduced;
-    typedef RealD2 scalar_object;
-    typedef RealD2 scalar_objectD;
+    typedef RealD scalar_object;
+    typedef RealD scalar_objectD;
     typedef vComplexD2 Complexified;
     typedef vRealD2 Realified;
     typedef vRealD2 DoublePrecision;
@@ -341,13 +280,13 @@ NAMESPACE_BEGIN(Grid);
     typedef vComplexD DoublePrecision2;
   };
   template<> struct GridTypeMapper<vComplexD2> : public GridTypeMapper_Base {
-    typedef ComplexD2 scalar_type;
-    typedef ComplexD2 scalar_typeD;
+    typedef ComplexD scalar_type;
+    typedef ComplexD scalar_typeD;
     typedef vComplexD2 vector_type;
     typedef vComplexD2 vector_typeD;
     typedef vComplexD2 tensor_reduced;
-    typedef ComplexD2 scalar_object;
-    typedef ComplexD2 scalar_objectD;
+    typedef ComplexD scalar_object;
+    typedef ComplexD scalar_objectD;
     typedef vComplexD2 Complexified;
     typedef vRealD2 Realified;
     typedef vComplexD2 DoublePrecision;
==== next file ====
@@ -201,12 +201,15 @@ void acceleratorInit(void)
 #ifdef GRID_SYCL
 cl::sycl::queue *theGridAccelerator;
+cl::sycl::queue *theCopyAccelerator;
 void acceleratorInit(void)
 {
   int nDevices = 1;
   cl::sycl::gpu_selector selector;
   cl::sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (selectedDevice);
+  //  theCopyAccelerator = new sycl::queue (selectedDevice);
+  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
==== next file ====
@@ -250,19 +250,25 @@ inline int acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 #ifdef GRID_SYCL
-NAMESPACE_END(Grid);
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
 #define GRID_SYCL_LEVEL_ZERO_IPC
-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
+NAMESPACE_END(Grid);
+#if 0
+#include <CL/sycl.hpp>
+#include <CL/sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <CL/sycl/backend/level_zero.hpp>
+#else
+#include <sycl/CL/sycl.hpp>
+#include <sycl/usm.hpp>
+#include <level_zero/ze_api.h>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
 NAMESPACE_BEGIN(Grid);
 extern cl::sycl::queue *theGridAccelerator;
+extern cl::sycl::queue *theCopyAccelerator;
 #ifdef __SYCL_DEVICE_ONLY__
 #define GRID_SIMT
@@ -290,7 +296,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
       cgh.parallel_for( \
          cl::sycl::nd_range<3>(global,local), \
          [=] (cl::sycl::nd_item<3> item) /*mutable*/ \
-         [[intel::reqd_sub_group_size(8)]] \
+         [[intel::reqd_sub_group_size(16)]] \
          { \
            auto iter1 = item.get_global_id(0); \
            auto iter2 = item.get_global_id(1); \
@@ -299,19 +305,19 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
       }); \
     });
-#define accelerator_barrier(dummy) theGridAccelerator->wait();
+#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
-  theGridAccelerator->memcpy(to,from,bytes);
-}
-inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; }
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
+inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
 inline int acceleratorIsCommunicable(void *ptr)
 {
 #if 0
@@ -514,7 +520,16 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,
 inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 #endif
+//////////////////////////////////////////////
+// Fencing needed ONLY for SYCL
+//////////////////////////////////////////////
+#ifdef GRID_SYCL
+inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
+#else
+// Ordering within a stream guaranteed on Nvidia & AMD
+inline void acceleratorFenceComputeStream(void){ };
+#endif
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
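Taken together with the stencil changes earlier in this diff, the intended pattern is roughly: queue transfers on the copy stream, drain it, then fence the compute stream before kernels consume the data. A sketch using only the functions defined above:

    void face_copy_example(void *dst, void *src, size_t bytes) {
      acceleratorCopyDeviceToDeviceAsynch(src, dst, bytes); // non-blocking
      acceleratorCopySynchronise();     // copy stream drained
      acceleratorFenceComputeStream();  // no-op on CUDA/HIP, barrier on SYCL
    }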
==== next file ====
@@ -167,14 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
   return;
 }
-void GridCmdOptionFloat(std::string &str,float & val)
+void GridCmdOptionFloat(std::string &str,double & val)
 {
   std::stringstream ss(str);
   ss>>val;
   return;
 }
 void GridParseLayout(char **argv,int argc,
		     Coordinate &latt_c,
		     Coordinate &mpi_c)
==== next file ====
@@ -57,7 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
 void GridCmdOptionInt(std::string &str,int & val);
-void GridCmdOptionFloat(std::string &str,float & val);
+void GridCmdOptionFloat(std::string &str,double & val);
 void GridParseLayout(char **argv,int argc,
==== next file ====
@@ -27,6 +27,7 @@
 /* END LEGAL */
 extern "C" {
 #include <openssl/sha.h>
+#include <openssl/evp.h>
 }
 #ifdef USE_IPP
 #include "ipp.h"
@@ -70,10 +71,8 @@ public:
   static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
   {
     std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
-    SHA256_CTX sha256;
-    SHA256_Init  (&sha256);
-    SHA256_Update(&sha256, data,bytes);
-    SHA256_Final (&hash[0], &sha256);
+    auto digest = EVP_get_digestbyname("SHA256");
+    EVP_Digest(data, bytes, &hash[0], NULL, digest, NULL);
     return hash;
   }
   static inline std::vector<int> sha256_seeds(const std::string &s)
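The EVP one-shot call above replaces the SHA256_Init/Update/Final trio, which OpenSSL 3.0 deprecates. A standalone sketch of the same API outside the class:

    #include <openssl/evp.h>
    #include <cstdio>
    #include <vector>
    int main() {
      const char msg[] = "Grid";
      std::vector<unsigned char> hash(EVP_MD_size(EVP_sha256()));
      EVP_Digest(msg, sizeof(msg)-1, &hash[0], NULL, EVP_sha256(), NULL);
      for (unsigned char c : hash) printf("%02x", c);
      printf("\n");
    }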
==== next file ====
@@ -39,7 +39,7 @@ int main(int argc, char **argv) {
   // Typedefs to simplify notation
   typedef WilsonImplR FermionImplPolicy;
-  typedef MobiusFermionR FermionAction;
+  typedef MobiusFermionD FermionAction;
   typedef typename FermionAction::FermionField FermionField;
   typedef Grid::XmlReader Serialiser;
@@ -133,8 +133,8 @@ int main(int argc, char **argv) {
   ////////////////////////////////////
   //  FermionAction StrangeOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_mass,M5,b,c, Params);
-  //  DomainWallEOFAFermionR Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5);
-  //  DomainWallEOFAFermionR Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5);
+  //  DomainWallEOFAFermionD Strange_Op_L(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mb, shift_L, pm, M5);
+  //  DomainWallEOFAFermionD Strange_Op_R(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, shift_R, pm, M5);
   //  ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L,Strange_Op_R,CG,ofp, false);
   FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
==== next file ====
@@ -175,9 +175,9 @@ int main(int argc, char **argv) {
   // Typedefs to simplify notation
   typedef WilsonImplR FermionImplPolicy;
-  typedef MobiusFermionR FermionAction;
+  typedef MobiusFermionD FermionAction;
   typedef MobiusFermionF FermionActionF;
-  typedef MobiusEOFAFermionR FermionEOFAAction;
+  typedef MobiusEOFAFermionD FermionEOFAAction;
   typedef MobiusEOFAFermionF FermionEOFAActionF;
   typedef typename FermionAction::FermionField FermionField;
   typedef typename FermionActionF::FermionField FermionFieldF;
@@ -293,9 +293,9 @@ int main(int argc, char **argv) {
   OFRp.precision= 50;
-  MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
+  MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
-  MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
+  MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
   MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
   ConjugateGradient<FermionField> ActionCG(ActionStoppingCondition,MaxCGIterations);
Some files were not shown because too many files have changed in this diff.