1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-22 00:44:45 +01:00

Compare commits

..

255 Commits

Author SHA1 Message Date
Peter Boyle
defcac92ab Somewhat better wrapped support for Icosahedral 2025-10-20 18:09:21 -04:00
Peter Boyle
4869378f1e Now computed some plaquettes.
First cut at stencil
2025-10-20 11:15:17 -04:00
Peter Boyle
c7b74db317 Default dimensions fixed 2025-10-09 14:57:22 -04:00
Peter Boyle
0ce201efbe IcosahedralVerted() checks 2025-10-09 13:35:16 -04:00
Peter Boyle
6d8a3d8bb2 Config 2025-10-09 13:30:16 -04:00
Peter Boyle
7dfd207ebb Need to protect pole operatoins to only take place on IcosahedralVertices mesh 2025-10-08 15:18:31 -04:00
Peter Boyle
3a65a096f2 Nd verbose 2025-10-07 18:49:00 -04:00
Peter Boyle
85b2bd4c93 Beginnings of S2xR 2025-10-07 16:11:06 -04:00
Peter Boyle
35e10a1159 Changes for Nd=3 2025-10-03 12:17:13 -04:00
d418f78352 Making running on Aurora more debuggable 2025-05-23 20:58:16 +00:00
25163998a0 Makes SYCL compiler happy 2025-05-23 20:57:11 +00:00
Peter Boyle
dc546aaa4b Updated config options for BNL cluster 2025-05-13 18:44:47 -04:00
Peter Boyle
5364d580c9 Output chirality, eigenvector density files and python source lego plot 2025-05-13 18:44:47 -04:00
Peter Boyle
2a9a6347e3 Do not require Grid format RNGs and also to the 5Li reporting 2025-05-13 18:44:47 -04:00
Peter Boyle
cfdb56f314 Run measurements at t=0 too 2025-05-13 18:44:46 -04:00
Peter Boyle
b517e88db3 Update README 2025-05-13 16:49:21 -04:00
bb317aba8d Lattice = for sycl 2025-05-13 12:50:58 +00:00
644cc6647e JSON update 2025-05-13 12:50:58 +00:00
72397ce23b SYCL interface change 2025-05-13 12:50:58 +00:00
Peter Boyle
d60a80c098 Fixes and visualisation 2025-04-29 18:04:23 -04:00
Peter Boyle
bb8b6d9d73 Fix 2025-04-29 18:04:04 -04:00
Peter Boyle
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
Peter Boyle
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
Peter Boyle
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
Peter Boyle
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
Peter Boyle
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
Peter Boyle
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGagueAction deriv
2025-04-24 14:39:10 -04:00
Peter Boyle
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
Peter Boyle
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
Peter Boyle
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
Peter Boyle
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
Peter Boyle
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
Peter Boyle
233150d93f Bug fix for no accelerator aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
Peter Boyle
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
Chulwoo Jung
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
Chulwoo Jung
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
Christoph Lehner
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
Christoph Lehner
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
Peter Boyle
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
Peter Boyle
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
Peter Boyle
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
Peter Boyle
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
Peter Boyle
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
Peter Boyle
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
Peter Boyle
938c47480f Updated compile on frontier.
Unsatisfactory hacsk
2025-04-04 18:35:06 -04:00
Peter Boyle
3811d19298 Fence 2025-04-04 18:35:06 -04:00
Peter Boyle
83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
Peter Boyle
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
Peter Boyle
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
Peter Boyle
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
Peter Boyle
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
Peter Boyle
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
Peter Boyle
9eae8fca5d Size outut 2025-04-04 18:35:05 -04:00
Peter Boyle
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
Mashy Green
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
Mashy Green
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
Peter Boyle
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
Christoph Lehner
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
Christoph Lehner
e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
Christoph Lehner
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora nowb 2025-03-11 13:50:43 +00:00
Mashy Green
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
Mashy Green
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
Mashy Green
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
Christoph Lehner
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
Peter Boyle
3d014864e2 Makinig LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
Peter Boyle
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
Peter Boyle
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
Peter Boyle
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
Peter Boyle
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
Peter Boyle
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
Peter Boyle
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
Peter Boyle
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
Peter Boyle
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
Peter Boyle
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
Muhammad Asif
b1ba209696 Latest upstream with np-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
Muhammad Asif
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
Mashy Green
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
Mashy Green
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
Mashy Green
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regresion test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after revering too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
Mashy Green
d2dd8f54e2 Fixing after revering too much! 2025-02-17 17:32:27 +00:00
Mashy Green
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce throuhgh Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issueing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
Mashy Green
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
b50fb34e71 Perf on Aurora 2025-02-01 18:39:34 +00:00
de84d730ff Fastest run config on Aurora to date 2025-02-01 18:08:40 +00:00
Peter Boyle
c74d11e3d7 PVdagM MG 2025-02-01 11:04:13 -05:00
Christoph Lehner
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
c4fc972fec Merge branch 'feature/deprecate-uvm' into develop 2025-01-31 16:32:36 +00:00
8cf809e231 Best results on Aurora so far 2025-01-31 16:14:45 +00:00
94019a922e Significantly better performance on Aurora without using pipeline mode 2025-01-30 16:36:46 +00:00
Mashy Green
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
Mashy Green
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
d6b2727f86 Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora 2025-01-29 09:22:21 +00:00
74a4f43946 Optional host buffer bounce for no CUDA aware MPI 2025-01-28 15:22:46 +00:00
1caf8b0f86 Rename 2025-01-28 15:22:37 +00:00
Chulwoo Jung
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
Chulwoo Jung
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
Peter Boyle
3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
Chulwoo Jung
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
Chulwoo Jung
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
Mashy Green
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
Mashy Green
e99d42404e Removing the regresion test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
Mashy Green
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
Mashy Green
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
8fe429346f Dslash testing for reproduce 2024-11-11 23:11:11 +00:00
Peter Boyle
5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
Peter Boyle
b91fc1b6b4 Merge branch 'feature/boosted' into feature/deprecate-uvm
Fixed boosted free field test
2024-10-28 16:53:09 -04:00
Peter Boyle
eafc150034 Test fft asserts 2024-10-23 16:46:26 -04:00
Peter Boyle
2877f1a268 Verbose reduce 2024-10-23 15:14:16 -04:00
Peter Boyle
1e893af775 GPU happy 2024-10-23 14:52:15 -04:00
Peter Boyle
d9f430a575 Happy GPU 2024-10-23 14:51:16 -04:00
Peter Boyle
63abe87f36 Memory manager verbose improvements that were useful to track an error 2024-10-23 14:49:13 -04:00
Peter Boyle
368d649c8a feature/deprecate-uvm happier -- preallocate device resident neigbour table 2024-10-23 14:47:55 -04:00
Peter Boyle
5603464f39 Fix in partial fraction import/export physical and
make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class
2024-10-23 14:45:58 -04:00
Peter Boyle
655c79f39e Suppress warning on partial override 2024-10-23 14:44:41 -04:00
Peter Boyle
565b231c03 Nvcc happy 2024-10-23 14:44:17 -04:00
Peter Boyle
62a9f180fa NVCC happy 2024-10-23 14:44:04 -04:00
Peter Boyle
5ae77876a8 Meson field and Aslash field on GPU; some compiler warning removed 2024-10-18 19:08:06 -04:00
Peter Boyle
4ed2c2c74f Config command 2024-10-18 13:58:33 -04:00
Peter Boyle
955da582b6 Working on NVCC 2024-10-18 13:58:03 -04:00
Peter Boyle
11b07b950d Vanilla linux compile, assuming spack prerequisites 2024-10-18 13:57:40 -04:00
Peter Boyle
8f70cfeda9 Clean up 2024-10-18 13:56:53 -04:00
Peter Boyle
ce64271048 Remove the copying version 2024-10-18 13:56:24 -04:00
5cc4f3241d Meson field test 2024-10-18 15:42:30 +00:00
Peter Boyle
6815e138b4 Boosted fermion attempt 2024-10-17 18:37:33 +01:00
a78a61d76f Update configure 2024-10-15 14:38:45 +00:00
2eff3f34ed Alternate reduction; default to grids own but make a configure flag
--enable-reduction=grid|mpi
2024-10-15 14:36:06 +00:00
03687c1d62 Final version of test, closer to original again 2024-10-15 14:35:17 +00:00
febfe4e77f Make my own reduction a configure flag 2024-10-15 14:32:35 +00:00
4d1aa134b5 Use normal reduction, configure flag to force deterministic 2024-10-15 14:32:11 +00:00
5ec879860a Odd rounding issue - bears looking into 2024-10-15 14:30:54 +00:00
Peter Boyle
f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
b728af903c Fast axpy norm under CFLAG 2024-10-11 03:23:09 +00:00
54f1999030 axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues 2024-10-11 03:22:18 +00:00
fd58f0b669 Return ok 2024-10-11 03:21:21 +00:00
c5c67b706e cl::sycl -> SYCL 2024-10-10 22:04:12 +00:00
be7a543e2c Revert barriers -- these were not the problem 2024-10-10 22:03:29 +00:00
68f112d576 New software moves cl::sycl 2024-10-10 22:03:04 +00:00
ec1395a304 Better flight logging 2024-10-10 22:01:57 +00:00
beb0e474ee Use deterministic own brand reduction 2024-10-10 22:01:24 +00:00
2b5fdcbbc5 New software version 2024-10-10 21:59:02 +00:00
295127d456 Deterministic homebrew reduction 2024-10-10 21:58:26 +00:00
7dcfb13694 New software stack 2024-10-10 21:57:35 +00:00
Peter Boyle
ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows for work around on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
Peter Boyle
2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
Peter Boyle
1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
Peter Boyle
3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
Peter Boyle
35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
Peter Boyle
bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
Peter Boyle
f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
9fa8bd6438 Configure for AOT on Aurora latest software 2024-09-23 11:25:44 +00:00
02c8178f16 Almost working on Aurora 2024-09-23 09:43:50 +00:00
e637fbacae Verbose remove 2024-09-23 09:42:43 +00:00
066544281f Deprecate UVM 2024-09-17 13:34:27 +00:00
11be10d2c0 Aurora testing 2024-09-10 18:11:52 +00:00
160969a758 UVM tester, doesn't turn up anything 2024-09-10 18:09:42 +00:00
622f78ebea SYCL updates -- operator = giving trouble on Aurora.
SYCL reduction is failing intermittently with SVM interface - returns
zero, expect non-zero.
Think I need to remove ALL dependence on SVM.
2024-09-04 13:53:48 +00:00
Peter Boyle
aa67a5b095 Rename 2024-08-27 19:54:01 +00:00
Peter Boyle
af9ea0864c Blas fix 2024-08-27 19:53:09 +00:00
Peter Boyle
4e2a6d87c4 Gemm batched fix 2024-08-27 19:24:05 +00:00
Peter Boyle
a465ecece9 Aurora 2024-08-27 19:20:43 +00:00
Peter Boyle
575eb72182 Converges on 16^3 2024-08-27 19:20:38 +00:00
Peter Boyle
3a973914d6 Compile on frontier 2024-08-27 14:55:42 -04:00
Peter Boyle
f568c07bbd Improved the BLAS benchmark 2024-08-27 14:53:54 -04:00
Peter Boyle
2c9878fc3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-27 12:05:46 -04:00
Peter Boyle
27b1b1b005 Checkerboard available for offloading pickCheckerboard 2024-08-27 12:04:09 -04:00
Peter Boyle
130d7ab077 Verbose changes 2024-08-27 12:03:28 -04:00
Peter Boyle
29f6b8a74a Setup 2024-08-27 12:02:49 -04:00
Peter Boyle
9779aaea33 16^3 optimise 2024-08-27 11:38:35 -04:00
Peter Boyle
ec25604a67 Fastest solver for mrhs multigrid 2024-08-27 11:32:34 -04:00
Peter Boyle
3668e81c5e Extract slice working on checkerboard field for Block Lanczos 2024-08-27 11:31:30 -04:00
Peter Boyle
d66b2423cb Move slice operations to GPU for BlockCG 2024-08-27 11:28:47 -04:00
Peter Boyle
15cc78f0b6 peek/poke local site on checkerboard arrays 2024-08-27 11:23:42 -04:00
Peter Boyle
06db4ddea2 Fast init on GPU 2024-08-27 11:22:33 -04:00
Peter Boyle
6cfb90e99f Support needed for accelerator resident set/pick Checkerboard 2024-08-27 11:19:00 -04:00
Peter Boyle
d8be95a2a3 Don't early terminate power method to get more accurate top EV 2024-08-27 11:17:37 -04:00
Peter Boyle
f82702872d Normal residual 2024-08-27 11:16:44 -04:00
Peter Boyle
3752c49ef0 Add option to record the CG polynomial 2024-08-27 11:14:35 -04:00
Peter Boyle
fe65fa4988 MulMatrix 2024-08-27 11:13:18 -04:00
Peter Boyle
1fe4c205a3 Adef 2024-08-27 11:11:47 -04:00
Peter Boyle
d4dc5e0f43 BlockCG linalg acceleratoin with BLAS 2024-08-27 11:08:33 -04:00
Peter Boyle
77944437ce Functor initialisation 2024-08-27 11:01:02 -04:00
Peter Boyle
c164bff758 MMdag 2024-08-27 11:00:36 -04:00
Peter Boyle
aa2e3d954a MMdag operator 2024-08-27 10:59:29 -04:00
Peter Boyle
de62b04728 Block CG linalg acceleration 2024-08-27 10:58:54 -04:00
Peter Boyle
d0bdb50f24 Analyse power spectrum 2024-08-27 10:58:19 -04:00
Peter Boyle
a8fecbc609 BlockCG linalg via BLAS 2024-08-21 16:08:16 -04:00
Peter Boyle
557fa483ff Blas benchmark committed stand alone 2024-08-20 16:18:43 +00:00
Peter Boyle
fc15d55df6 Mallinfo 2024-08-20 14:33:09 +00:00
Peter Boyle
53573d7d94 Better benchmark 2024-08-20 14:31:57 +00:00
Peter Boyle
bb3c177000 Better benchmarking 2024-08-20 14:31:41 +00:00
Peter Boyle
a3322b470f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-20 14:30:52 +00:00
Peter Boyle
f8f408e7a9 BLAS everywhere 2024-07-25 18:09:02 +00:00
Peter Boyle
baac1127d0 Later intel compiler happiness 2024-07-25 18:06:05 +00:00
Peter Boyle
6f1328160c Remove SVM use 2024-07-25 18:05:40 +00:00
Peter Boyle
04cf902791 Mallinfo and ASAN hooks 2024-07-25 18:04:56 +00:00
Peter Boyle
7a5b1c1a19 Try Catch convenience macro 2024-07-25 18:03:41 +00:00
Peter Boyle
18d2d7da4a Eigen implementation and SYCL implementation 2024-07-25 18:02:56 +00:00
Peter Boyle
b461184797 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-23 09:53:58 -04:00
Peter Boyle
4563b39305 New Frontier config 2024-07-23 09:53:08 -04:00
Peter Boyle
c9d5674d5b FInal for paper 2024-07-22 15:26:45 -04:00
Peter Boyle
486412635a 8^4 test for PETSc 2024-07-22 15:25:17 -04:00
Peter Boyle
8b23a1546a Force compile temporarily 2024-07-22 15:24:56 -04:00
Peter Boyle
a901e4e369 Regressed performance for paper 2024-07-22 15:24:04 -04:00
Peter Boyle
804d9367d4 Regressed performance 2024-07-22 15:23:25 -04:00
Peter Boyle
41d8adca95 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-11 15:38:45 +00:00
Peter Boyle
059e8e5bb0 New compile option 2024-07-11 15:37:30 +00:00
Peter Boyle
b3ee8ded96 Respect command line 2024-07-11 15:34:48 +00:00
Peter Boyle
cf3584ad15 Convenient to monitor memory across an HMC trajectory 2024-07-11 15:30:32 +00:00
Peter Boyle
a66973163f Device vector not UVM 2024-07-11 15:24:11 +00:00
Peter Boyle
4502a8c8a1 libc malloc heap info dump on Linux 2024-07-11 15:22:18 +00:00
Peter Boyle
9c902e4c2d Batched blas, but not working yet on OneAPI 2024-07-11 15:19:49 +00:00
Peter Boyle
f3eb36adcf Namespace addition 2024-07-11 15:19:19 +00:00
Peter Boyle
7c246606c1 Schur additional case 2024-07-10 22:04:32 +00:00
Peter Boyle
172c75029e Redblack additional case 2024-07-10 22:03:59 +00:00
Peter Boyle
6ae52da571 LLVM leak sanitizer 2024-07-08 15:59:18 +00:00
Peter Boyle
4ee9c68053 Updated compile environment 2024-07-08 15:57:57 +00:00
Peter Boyle
a15b4378a3 Sanitizer preservation of options 2024-07-08 15:57:45 +00:00
Peter Boyle
89fdd7f8dd AOT compilation 2024-07-05 17:47:56 +00:00
Peter Boyle
c328be24b7 Sanitizer compile options 2024-07-05 17:46:43 +00:00
Peter Boyle
a73dc6dbf4 Display linux heap info 2024-06-28 16:05:17 +00:00
Peter Boyle
eee2a2657f Try catch exception wrappers 2024-06-28 16:02:29 +00:00
Peter Boyle
12b8be7cb9 Best so far on 96^3 350 Evecs converged on 4^4 block 2024-06-18 16:31:37 -04:00
Peter Boyle
63c223ea5d Verbose 2024-06-18 03:22:01 +00:00
Peter Boyle
2877fb4a2c More verbose if alloc failure 2024-06-18 03:21:03 +00:00
Peter Boyle
d299c86633 Std::asin,acos 2024-06-11 16:41:23 -04:00
Peter Boyle
6ce52092e8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 15:16:58 -04:00
Peter Boyle
b5926c1d21 Broadcast time info 2024-06-11 15:16:25 -04:00
Peter Boyle
9563238e9b Force initial to identity 2024-06-11 17:51:58 +00:00
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
Peter Boyle
e29b97b3ea Qslash term added 2023-09-14 16:14:03 -04:00
Peter Boyle
ad2b699d2b Better macos 2023-09-14 16:12:21 -04:00
279 changed files with 16677 additions and 4203 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -0,0 +1,5 @@
CXX=hipcc
MPICXX=mpicxx
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench

View File

@@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -37,6 +37,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/qcd/QCD.h> #include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h> #include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h> #include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/spin/Pauli.h> // depends on Gparity
#include <Grid/qcd/utils/Utils.h> #include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h> #include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore); NAMESPACE_CHECK(GridQCDCore);

View File

@@ -30,9 +30,14 @@ directory
#include <type_traits> #include <type_traits>
#include <cassert> #include <cassert>
#include <exception>
#define NAMESPACE_BEGIN(A) namespace A { #define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) } #define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid) #define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" ); #define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }

View File

@@ -50,6 +50,7 @@ NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/Deflation.h> #include <Grid/algorithms/deflation/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h> #include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h> #include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
NAMESPACE_CHECK(deflation); NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad); NAMESPACE_CHECK(ConjGrad);

View File

@@ -168,6 +168,7 @@ public:
template<class vobj> template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){ void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW #ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
assert(0); assert(0);
#else #else
conformable(result.Grid(),vgrid); conformable(result.Grid(),vgrid);
@@ -190,6 +191,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite); autoView(pgbuf_v , pgbuf, CpuWrite);
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -213,6 +215,7 @@ public:
else if ( sign == forward ) div = 1.0; else if ( sign == forward ) div = 1.0;
else assert(0); else assert(0);
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p; FFTW_plan p;
{ {
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -226,6 +229,7 @@ public:
} }
// Barrel shift and collect global pencil // Barrel shift and collect global pencil
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd); Coordinate lcoor(Nd), gcoor(Nd);
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
@@ -247,6 +251,7 @@ public:
} }
} }
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords // Loop over orthog coords
int NN=pencil_g.lSites(); int NN=pencil_g.lSites();
GridStopWatch timer; GridStopWatch timer;
@@ -269,6 +274,7 @@ public:
usec += timer.useconds(); usec += timer.useconds();
flops+= flops_call*NN; flops+= flops_call*NN;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result // writing out result
{ {
autoView(pgbuf_v,pgbuf,CpuRead); autoView(pgbuf_v,pgbuf,CpuRead);
@@ -285,6 +291,7 @@ public:
} }
result = result*div; result = result*div;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan // destroying plan
FFTW<scalar>::fftw_destroy_plan(p); FFTW<scalar>::fftw_destroy_plan(p);
#endif #endif

View File

@@ -103,6 +103,38 @@ public:
_Mat.MdagM(in,out); _Mat.MdagM(in,out);
} }
}; };
template<class Matrix,class Field>
class MMdagLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MMdag(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.MMdag(in,out);
}
};
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother // Construct herm op and shift it for mgrid smoother
@@ -245,6 +277,38 @@ public:
assert(0); assert(0);
} }
}; };
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several // Even Odd Schur decomp operators; there are several

View File

@@ -45,6 +45,11 @@ public:
M(in,tmp); M(in,tmp);
Mdag(tmp,out); Mdag(tmp,out);
} }
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0; virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;

View File

@@ -59,7 +59,7 @@ public:
RealD diff = hi-lo; RealD diff = hi-lo;
RealD delta = diff*1.0e-9; RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) { for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1; delta*=1.02;
RealD f = approx(x); RealD f = approx(x);
out<< x<<" "<<f<<std::endl; out<< x<<" "<<f<<std::endl;
} }
@@ -131,6 +131,26 @@ public:
Coeffs[j] = s * 2.0/order; Coeffs[j] = s * 2.0/order;
} }
}; };
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){ void JacksonSmooth(void){
@@ -249,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo); RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo); RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y); Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in); axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1 // sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1; // out = ()*T0 + Coeffs[1]*T1;

View File

@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
typedef cublasHandle_t gridblasHandle_t; typedef cublasHandle_t gridblasHandle_t;
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
typedef cl::sycl::queue *gridblasHandle_t; typedef sycl::queue *gridblasHandle_t;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
typedef cl::sycl::queue *gridblasHandle_t; typedef sycl::queue *gridblasHandle_t;
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t; typedef int32_t gridblasHandle_t;
@@ -89,9 +89,10 @@ public:
gridblasHandle = theGridAccelerator; gridblasHandle = theGridAccelerator;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
cl::sycl::cpu_selector selector; sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector }; sycl::device selectedDevice { selector };
gridblasHandle =new sycl::queue (selectedDevice); sycl::property_list q_prop{sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif #endif
gridblasInit=1; gridblasInit=1;
} }
@@ -207,6 +208,9 @@ public:
assert(Bkn.size()==batchCount); assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount); assert(Cmn.size()==batchCount);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
int ldc = m; // m x b column major int ldc = m; // m x b column major
@@ -266,26 +270,169 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS); assert(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI int64_t m64=m;
#warning "oneMKL implementation not built " int64_t n64=n;
#endif int64_t k64=k;
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) int64_t lda64=lda;
// Need a default/reference implementation int64_t ldb64=ldb;
int sda = lda*k; int64_t ldc64=ldc;
int sdb = ldb*k; int64_t batchCount64=batchCount;
int sdc = ldc*n;
for (int p = 0; p < batchCount; ++p) { oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexD *) &alpha_p[0],
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
(const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexD *) &beta_p[0],
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#if 0
// This code was used to check the mat mul on Sunspot/OneMKL
std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
std::vector<ComplexD> A(m*k); // pointer list to matrices
std::vector<ComplexD> B(k*n);
std::vector<ComplexD> C(m*n);
// int sda = lda*k;
// int sdb = ldb*k;
// int sdc = ldc*n;
std::cerr << " Checking the GEMM results "<<std::endl;
for (int p = 0; p < 1; ++p) {
ComplexD * Amk_p; // pointer list to matrices
ComplexD * Bkn_p; // pointer list to matrices
ComplexD * Cmn_p; // pointer list to matrices
acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
std::cerr << " p " << p << " copied pointers "<<std::endl;
acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
std::cerr << " p " << p << " copied matrices "<<std::endl;
std::cerr << " C[0] "<<C[0]<<std::endl;
std::cerr << " A[0] "<<A[0]<<std::endl;
std::cerr << " B[0] "<<B[0]<<std::endl;
std::cerr << " m "<<m<<std::endl;
std::cerr << " n "<<n<<std::endl;
std::cerr << " k "<<k<<std::endl;
for (int mm = 0; mm < m; ++mm) { for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) { for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0); ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk) for (int kk = 0; kk < k; ++kk) {
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; int idx_a, idx_b;
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; // int lda = m; // m x k column major
// int ldb = k; // k x n column major
// int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N) {
idx_a =kk + mm*lda;
} else {
idx_a =mm + kk*lda;
}
if(OpB!=GridBLAS_OP_N) {
idx_b =nn + kk*ldb;
} else {
idx_b =kk + nn*ldb;
}
// std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
ComplexD Ac = A[idx_a];
ComplexD Bc = B[idx_b];
if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
c_mn += Ac*Bc;
}
std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
} }
} }
} }
#endif #endif
// synchronise(); #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
#endif
RealD t1=usecond(); RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount; RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@@ -306,6 +453,9 @@ public:
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
int ldc = m; // m x b column major int ldc = m; // m x b column major
@@ -366,25 +516,110 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS); assert(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI int64_t m64=m;
#warning "oneMKL implementation not built " int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k; // Need a default/reference implementation; use Eigen
int sdb = ldb*k; if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
int sdc = ldc*n; thread_for (p, batchCount, {
ComplexF alphaf(real(alpha),imag(alpha)); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
ComplexF betaf(real(beta),imag(beta)); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
// Need a default/reference implementation Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
for (int p = 0; p < batchCount; ++p) { if (std::abs(beta) != 0.0)
for (int mm = 0; mm < m; ++mm) { eCmn = beta * eCmn + alpha * eAmk * eBkn ;
for (int nn = 0; nn < n; ++nn) { else
ComplexF c_mn(0.0); eCmn = alpha * eAmk * eBkn ;
for (int kk = 0; kk < k; ++kk) });
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ]; thread_for (p, batchCount, {
} Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
} Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
} }
#endif #endif
RealD t1=usecond(); RealD t1=usecond();
@@ -408,6 +643,9 @@ public:
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
int ldc = m; // m x b column major int ldc = m; // m x b column major
@@ -467,23 +705,80 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS); assert(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI int64_t m64=m;
#warning "oneMKL implementation not built " int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(float *) &alpha_p[0],
(const float **)&Amk[0], (const int64_t *)&lda64,
(const float **)&Bkn[0], (const int64_t *)&ldb64,
(float *) &beta_p[0],
(float **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k; // Need a default/reference implementation; use Eigen
int sdb = ldb*k; if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
int sdc = ldc*n; thread_for (p, batchCount, {
// Need a default/reference implementation Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
for (int p = 0; p < batchCount; ++p) { Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
for (int mm = 0; mm < m; ++mm) { Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
for (int nn = 0; nn < n; ++nn) { if (std::abs(beta) != 0.0)
RealD c_mn(0.0); eCmn = beta * eCmn + alpha * eAmk * eBkn ;
for (int kk = 0; kk < k; ++kk) else
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; eCmn = alpha * eAmk * eBkn ;
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; });
} } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
} thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
} }
#endif #endif
RealD t1=usecond(); RealD t1=usecond();
@@ -495,7 +790,6 @@ public:
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM // Double precision real GEMM
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA, void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB, GridBLASOperation_t OpB,
int m,int n, int k, int m,int n, int k,
@@ -508,6 +802,9 @@ public:
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
int ldc = m; // m x b column major int ldc = m; // m x b column major
@@ -568,39 +865,80 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS); assert(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
/*
int64_t m64=m; int64_t m64=m;
int64_t n64=n; int64_t n64=n;
int64_t k64=k; int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount; int64_t batchCount64=batchCount;
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
onemkl::transpose::N, oneapi::mkl::transpose iOpA;
onemkl::transpose::N, oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64, &m64,&n64,&k64,
(double *) &alpha_p[0], (double *) &alpha_p[0],
(double **)&Amk[0], lda, (const double **)&Amk[0], (const int64_t *)&lda64,
(double **)&Bkn[0], ldb, (const double **)&Bkn[0], (const int64_t *)&ldb64,
(double *) &beta_p[0], (double *) &beta_p[0],
(double **)&Cmn[0], ldc, (double **)&Cmn[0], (const int64_t *)&ldc64,
1,&batchCount64); (int64_t)1,&batchCount64,std::vector<sycl::event>());
*/ synchronise();
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k; // Need a default/reference implementation; use Eigen
int sdb = ldb*k; if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
int sdc = ldc*n; thread_for (p, batchCount, {
// Need a default/reference implementation Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
for (int p = 0; p < batchCount; ++p) { Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
for (int mm = 0; mm < m; ++mm) { Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
for (int nn = 0; nn < n; ++nn) { if (std::abs(beta) != 0.0)
RealD c_mn(0.0); eCmn = beta * eCmn + alpha * eAmk * eBkn ;
for (int kk = 0; kk < k; ++kk) else
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; eCmn = alpha * eAmk * eBkn ;
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; });
} } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
} thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
} }
#endif #endif
RealD t1=usecond(); RealD t1=usecond();
@@ -608,120 +946,55 @@ public:
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
} }
template<class CComplex>
////////////////////////////////////////////////////////////////////////////////////////////////
// Strided case used by benchmark, but generally unused in Grid
// Keep a code example in double complex, but don't generate the single and real variants for now
////////////////////////////////////////////////////////////////////////////////////////////////
void gemmStridedBatched(int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // pointer list to matrices
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn,
int batchCount)
{
// Use C-row major storage, so transpose calls
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
int sda = m*k;
int sdb = k*n;
int sdc = m*n;
deviceVector<ComplexD> alpha_p(1);
deviceVector<ComplexD> beta_p(1);
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
// std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
// std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
// std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
#ifdef GRID_HIP
auto err = hipblasZgemmStridedBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda, sda,
(hipblasDoubleComplex *) Bkn, ldb, sdb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc, sdc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasZgemmStridedBatched(gridblasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda, sda,
(cuDoubleComplex *) Bkn, ldb, sdb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc, sdc,
batchCount);
#endif
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
oneapi::mkl::transpose::N,
oneapi::mkl::transpose::N,
m,n,k,
alpha,
(const ComplexD *)Amk,lda,sda,
(const ComplexD *)Bkn,ldb,sdb,
beta,
(ComplexD *)Cmn,ldc,sdc,
batchCount);
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
}
}
}
#endif
}
double benchmark(int M, int N, int K, int BATCH) double benchmark(int M, int N, int K, int BATCH)
{ {
int32_t N_A = M*K*BATCH; int32_t N_A = M*K*BATCH;
int32_t N_B = K*N*BATCH; int32_t N_B = K*N*BATCH;
int32_t N_C = M*N*BATCH; int32_t N_C = M*N*BATCH;
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
ComplexD alpha(1.0); CComplex alpha(1.0);
ComplexD beta (1.0); CComplex beta (1.0);
RealD flops = 8.0*M*N*K*BATCH; RealD flops = 8.0*M*N*K*BATCH;
int ncall=10; int ncall=1000;
deviceVector<CComplex *> As(BATCH);
deviceVector<CComplex *> Bs(BATCH);
deviceVector<CComplex *> Cs(BATCH);
for(int b = 0 ; b < BATCH;b++) {
CComplex *ptr;
ptr = &A[b*M*K]; acceleratorPut(As[b],ptr);
ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr);
ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr);
}
// Warm up call
gemmBatched(M,N,K,
alpha,
As, // m x k
Bs, // k x n
beta,
Cs);
synchronise();
RealD t0 = usecond(); RealD t0 = usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
gemmStridedBatched(M,N,K, gemmBatched(M,N,K,
alpha, alpha,
&A[0], // m x k As, // m x k
&B[0], // k x n Bs, // k x n
beta, beta,
&C[0], // m x n Cs);
BATCH);
}
synchronise(); synchronise();
}
RealD t1 = usecond(); RealD t1 = usecond();
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH; RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
flops = 8.0*M*N*K*BATCH*ncall; flops = 8.0*M*N*K*BATCH*ncall;
flops = flops/(t1-t0)/1.e3; flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops return flops; // Returns gigaflops
} }
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);

View File

@@ -447,10 +447,10 @@ public:
///////////////////////////////////////// /////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw, nbasis,nrhs,vw,
ComplexD(1.0), scalar(1.0),
Vd, Vd,
Fd, Fd,
ComplexD(0.0), // wipe out C scalar(0.0), // wipe out C
Cd); Cd);
BLAS.synchronise(); BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl; // std::cout << "BlockProject done"<<std::endl;
@@ -497,10 +497,10 @@ public:
int64_t vw = block_vol * words; int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nbasis, vw,nrhs,nbasis,
ComplexD(1.0), scalar(1.0),
Vd, Vd,
Cd, Cd,
ComplexD(0.0), // wipe out C scalar(0.0), // wipe out C
Fd); Fd);
BLAS.synchronise(); BLAS.synchronise();
// std::cout << " blas call done"<<std::endl; // std::cout << " blas call done"<<std::endl;

View File

@@ -182,10 +182,10 @@ public:
///////////////////////////////////////// /////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw, nev,nrhs,vw,
ComplexD(1.0), scalar(1.0),
Ed, Ed,
Rd, Rd,
ComplexD(0.0), // wipe out C scalar(0.0), // wipe out C
Cd); Cd);
BLAS.synchronise(); BLAS.synchronise();
@@ -210,10 +210,10 @@ public:
///////////////////////////////////////// /////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev, vw,nrhs,nev,
ComplexD(1.0), scalar(1.0),
Ed, // x . nev Ed, // x . nev
Cd, // nev . nrhs Cd, // nev . nrhs
ComplexD(0.0), scalar(0.0),
Gd); Gd);
BLAS.synchronise(); BLAS.synchronise();

View File

@@ -53,6 +53,7 @@ class TwoLevelCGmrhs
// Fine operator, Smoother, CoarseSolver // Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop; LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother; LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer; GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer; GridStopWatch PromoteTimer;
@@ -62,7 +63,12 @@ class TwoLevelCGmrhs
GridStopWatch SmoothTimer; GridStopWatch SmoothTimer;
GridStopWatch InsertTimer; GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions // more most opertor functions
TwoLevelCGmrhs(RealD tol, TwoLevelCGmrhs(RealD tol,
Integer maxit, Integer maxit,
@@ -73,12 +79,313 @@ class TwoLevelCGmrhs
MaxIterations(maxit), MaxIterations(maxit),
_FineLinop(FineLinop), _FineLinop(FineLinop),
_Smoother(Smoother) _Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{ {
grid = fine; grid = fine;
}; };
// Vector case // Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x) virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_zz,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
std::vector<Field> & MQ,
const std::vector<Field> & Z,
const std::vector<Field> & MZ)
{
RealD t0=usecond();
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
RealD t1=usecond();
m_zz = 0.5*(m_zz+m_zz.adjoint());
Eigen::MatrixXcd L = m_zz.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
RealD t3=usecond();
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
RealD t4=usecond();
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
assert(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{ {
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl; std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier(); src[0].Grid()->Barrier();
@@ -361,15 +668,26 @@ public:
CoarseField PleftProjMrhs(this->coarsegridmrhs); CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs); CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
for(int rhs=0;rhs<nrhs;rhs++) { // this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
this->_Smoother(in,Min);
this->SmoothTimer.Stop();
#else
for(int rhs=0;rhs<nrhs;rhs++) {
this->SmoothTimer.Start(); this->SmoothTimer.Start();
this->_Smoother(in[rhs],Min[rhs]); this->_Smoother(in[rhs],Min[rhs]);
this->SmoothTimer.Stop(); this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
this->FineTimer.Start(); this->FineTimer.Start();
this->_FineLinop.HermOp(Min[rhs],out[rhs]); this->_FineLinop.HermOp(Min[rhs],out[rhs]);
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
this->FineTimer.Stop(); this->FineTimer.Stop();
@@ -401,9 +719,11 @@ public:
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min] this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop(); this->PromoteTimer.Stop();
this->FineTimer.Start(); this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) { for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
} }
// this->zzz=out[0];
this->FineTimer.Stop(); this->FineTimer.Stop();
} }
}; };

View File

@@ -31,6 +31,58 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
typedef typename Field::scalar_type scomplex;
int Nblock = X.size();
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
//
//Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
template<class Field>
double normv(const std::vector<Field> &P){
int Nblock = P.size();
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog); sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related // Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
const std::vector<Field> & R) const std::vector<Field> & R)
{ {
InnerProductMatrix(m_rr,R,R); InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
@@ -186,6 +256,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog); sliceNorm(ssq,B,Orthog);
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog); sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD); Linop.HermOp(X, AD);
tmp = B - AD; tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q; D=Q;
@@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
SolverTimer.Start(); SolverTimer.Start();
RealD max_resid=0;
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations; k++) {
@@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/ */
m_rr = m_C.adjoint() * m_C; m_rr = m_C.adjoint() * m_C;
RealD max_resid=0; max_resid=0;
RealD rrsum=0; RealD rrsum=0;
RealD rr; RealD rr;
@@ -322,7 +398,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
} }
} }
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k; IterationsToComplete = k;
@@ -466,43 +544,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
IterationsToComplete = k; IterationsToComplete = k;
} }
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation: // BlockCGrQvec implementation:
//-------------------------- //--------------------------
@@ -549,6 +590,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) { for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]); Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b]; tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
} }
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);

View File

@@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
// single input vec, single output vec. // single input vec, single output vec.
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Field> template <class Field>
class ConjugateGradient : public OperatorFunction<Field> { class ConjugateGradient : public OperatorFunction<Field> {
public: public:
@@ -57,10 +58,22 @@ public:
ErrorOnNoConverge(err_on_no_conv) ErrorOnNoConverge(err_on_no_conv)
{}; {};
virtual void LogIteration(int k,RealD a,RealD b){
// std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
};
virtual void LogBegin(void){
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
this->LogBegin();
GRID_TRACE("ConjugateGradient"); GRID_TRACE("ConjugateGradient");
GridStopWatch PreambleTimer; GridStopWatch PreambleTimer;
GridStopWatch ConstructTimer;
GridStopWatch NormTimer;
GridStopWatch AssignTimer;
PreambleTimer.Start(); PreambleTimer.Start();
psi.Checkerboard() = src.Checkerboard(); psi.Checkerboard() = src.Checkerboard();
@@ -70,14 +83,19 @@ public:
//RealD b_pred; //RealD b_pred;
// Was doing copies // Was doing copies
ConstructTimer.Start();
Field p (src.Grid()); Field p (src.Grid());
Field mmp(src.Grid()); Field mmp(src.Grid());
Field r (src.Grid()); Field r (src.Grid());
ConstructTimer.Stop();
// Initial residual computation & set up // Initial residual computation & set up
NormTimer.Start();
ssq = norm2(src); ssq = norm2(src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
NormTimer.Stop();
assert(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) { if ( guess == 0.0 ) {
r = src; r = src;
p = r; p = r;
@@ -89,6 +107,7 @@ public:
a = norm2(p); a = norm2(p);
} }
cp = a; cp = a;
AssignTimer.Stop();
// Handle trivial case of zero src // Handle trivial case of zero src
if (ssq == 0.){ if (ssq == 0.){
@@ -164,6 +183,7 @@ public:
} }
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
LogIteration(k,a,b);
IterationTimer.Stop(); IterationTimer.Stop();
if ( (k % 500) == 0 ) { if ( (k % 500) == 0 ) {
@@ -220,6 +240,9 @@ public:
<<" residual "<< std::sqrt(cp / ssq)<< std::endl; <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
SolverTimer.Stop(); SolverTimer.Stop();
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Solver breakdown "<<std::endl; std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
@@ -233,5 +256,118 @@ public:
} }
}; };
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
// Optionally record the CG polynomial
std::vector<double> ak;
std::vector<double> bk;
std::vector<double> poly_p;
std::vector<double> poly_r;
std::vector<double> poly_Ap;
std::vector<double> polynomial;
public:
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
{ };
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
Field tmp(src.Grid());
Field AtoN(src.Grid());
AtoN = src;
psi=AtoN*polynomial[0];
for(int n=1;n<polynomial.size();n++){
tmp = AtoN;
Linop.HermOp(tmp,AtoN);
psi = psi + polynomial[n]*AtoN;
}
}
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
{
Field Ap(src.Grid());
Field r(src.Grid());
Field p(src.Grid());
p=src;
r=src;
x=Zero();
x.Checkerboard()=src.Checkerboard();
for(int k=0;k<ak.size();k++){
x = x + ak[k]*p;
Linop.HermOp(p,Ap);
r = r - ak[k] * Ap;
p = r + bk[k] * p;
}
}
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
psi=Zero();
this->operator ()(Linop,src,psi);
}
virtual void LogBegin(void)
{
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
ak.resize(0);
bk.resize(0);
polynomial.resize(0);
poly_Ap.resize(0);
poly_Ap.resize(0);
poly_p.resize(1);
poly_r.resize(1);
poly_p[0]=1.0;
poly_r[0]=1.0;
};
virtual void LogIteration(int k,RealD a,RealD b)
{
// With zero guess,
// p = r = src
//
// iterate:
// x = x + a p
// r = r - a A p
// p = r + b p
//
// [0]
// r = x
// p = x
// Ap=0
//
// [1]
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
// x = x + a p ==> add polynomials term by term
// r = r - a A p ==> add polynomials term by term
// p = r + b p ==> add polynomials term by term
//
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
ak.push_back(a);
bk.push_back(b);
// Ap= right_shift(p)
poly_Ap.resize(k+1);
poly_Ap[0]=0.0;
for(int i=0;i<k;i++){
poly_Ap[i+1]=poly_p[i];
}
// x = x + a p
polynomial.resize(k);
polynomial[k-1]=0.0;
for(int i=0;i<k;i++){
polynomial[i] = polynomial[i] + a * poly_p[i];
}
// r = r - a Ap
// p = r + b p
poly_r.resize(k+1);
poly_p.resize(k+1);
poly_r[k] = poly_p[k] = 0.0;
for(int i=0;i<k+1;i++){
poly_r[i] = poly_r[i] - a * poly_Ap[i];
poly_p[i] = poly_r[i] + b * poly_p[i];
}
}
};
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
//Compute double precision rsd and also new RHS vector. //Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d); Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){ if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break; break;
} }
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp); precisionChange(src_f, src_d, pc_wk_dp_to_sp);

View File

@@ -102,11 +102,11 @@ public:
assert(mass.size()==nshift); assert(mass.size()==nshift);
assert(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // remove dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift]; std::vector<RealD> bs(nshift);
RealD rsq[nshift]; std::vector<RealD> rsq(nshift);
RealD z[nshift][2]; std::vector<std::array<RealD,2> > z(nshift);
int converged[nshift]; std::vector<int> converged(nshift);
const int primary =0; const int primary =0;

View File

@@ -123,11 +123,11 @@ public:
assert(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift]; std::vector<RealD> bs(nshift);
RealD rsq[nshift]; std::vector<RealD> rsq(nshift);
RealD rsqf[nshift]; std::vector<RealD> rsqf(nshift);
RealD z[nshift][2]; std::vector<std::array<RealD,2> > z(nshift);
int converged[nshift]; std::vector<int> converged(nshift);
const int primary =0; const int primary =0;

View File

@@ -156,11 +156,11 @@ public:
assert(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift]; std::vector<RealD> bs(nshift);
RealD rsq[nshift]; std::vector<RealD> rsq(nshift);
RealD rsqf[nshift]; std::vector<RealD> rsqf(nshift);
RealD z[nshift][2]; std::vector<std::array<RealD,2> > z(nshift);
int converged[nshift]; std::vector<int> converged(nshift);
const int primary =0; const int primary =0;

View File

@@ -288,7 +288,7 @@ public:
//---------------------------------------------------------------------- //----------------------------------------------------------------------
if ( Nm>Nk ) { if ( Nm>Nk ) {
Glog <<" #Apply shifted QR transformations "<<std::endl; // Glog <<" #Apply shifted QR transformations "<<std::endl;
//int k2 = Nk+Nu; //int k2 = Nk+Nu;
int k2 = Nk; int k2 = Nk;
@@ -298,6 +298,7 @@ public:
unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM); unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
for(int ip=Nk; ip<Nm; ++ip){ for(int ip=Nk; ip<Nm; ++ip){
Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q); shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
} }
@@ -467,10 +468,10 @@ public:
// set initial vector // set initial vector
for (int i=0; i<Nu; ++i) { for (int i=0; i<Nu; ++i) {
// Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl; Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
evec[i] = src[i]; evec[i] = src[i];
orthogonalize(evec[i],evec,i); orthogonalize(evec[i],evec,i);
// Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl; Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
} }
// exit(-43); // exit(-43);
@@ -506,11 +507,11 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nr,Nr); Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid); diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
_sort.push(eval2,Nr); _sort.push(eval2,Nr);
// Glog << "#Ritz value: "<< std::endl; Glog << "#Ritz value: "<< std::endl;
for(int i=0; i<Nr; ++i){ for(int i=0; i<Nr; ++i){
// std::cout.precision(13); std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
} }
// Convergence test // Convergence test
@@ -570,6 +571,7 @@ public:
Glog << fname + " NOT converged ; Summary :\n"; Glog << fname + " NOT converged ; Summary :\n";
} else { } else {
Glog << fname + " CONVERGED ; Summary :\n"; Glog << fname + " CONVERGED ; Summary :\n";
Nstop = Nconv_guess; // Just take them all
// Sort convered eigenpairs. // Sort convered eigenpairs.
std::vector<Field> Btmp(Nstop,grid); // waste of space replicating std::vector<Field> Btmp(Nstop,grid); // waste of space replicating
@@ -951,7 +953,7 @@ if (1){
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu]; M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
} }
} }
//Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl; // Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
} }
@@ -977,7 +979,7 @@ if (1){
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu); lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
} }
} }
//Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl; // Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
} }
@@ -1002,6 +1004,7 @@ if (1){
// lower triangular part used to represent series // lower triangular part used to represent series
// of Q sequence. // of Q sequence.
// Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
// equivalent operation of Qprod *= Q // equivalent operation of Qprod *= Q
//M = Eigen::MatrixXcd::Zero(Nm,Nm); //M = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1022,6 +1025,7 @@ if (1){
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
// Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
for (int j=0; j<Nm-(Nu+1); ++j) { for (int j=0; j<Nm-(Nu+1); ++j) {
for (int k=0; k<Nu+1+j; ++k) { for (int k=0; k<Nu+1+j; ++k) {
@@ -1029,6 +1033,7 @@ if (1){
} }
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
for (int j=Nm-(Nu+1); j<Nm; ++j) { for (int j=Nm-(Nu+1); j<Nm; ++j) {
for (int k=0; k<Nm; ++k) { for (int k=0; k<Nm; ++k) {
@@ -1036,6 +1041,7 @@ if (1){
} }
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
//static int ntimes = 2; //static int ntimes = 2;
//for (int j=0; j<Nm-(ntimes*Nu); ++j) { //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@@ -1061,11 +1067,13 @@ if (1){
Mtmp(j,i) = conj(Mtmp(i,j)); Mtmp(j,i) = conj(Mtmp(i,j));
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh; Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
M = Mtmp; M = Mtmp;
//M = Q.adjoint()*(M*Q); //M = Q.adjoint()*(M*Q);
@@ -1077,7 +1085,7 @@ if (1){
// } // }
//} //}
//Glog << "shiftedQRDecompEigen() end" << endl; // Glog << "shiftedQRDecompEigen() end" <<std::endl;
} }
void exampleQRDecompEigen(void) void exampleQRDecompEigen(void)

View File

@@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp); _HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0); // std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl; // std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. // RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001) if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_; i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na; evalMaxApprox = na;
@@ -255,6 +256,7 @@ until convergence
src_n = tmp; src_n = tmp;
} }
} }
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm); std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm); std::vector<RealD> lme2(Nm);

View File

@@ -60,6 +60,32 @@ public:
} }
}; };
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
Field res(in.Grid());
Field tmp(in.Grid());
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
_Guess(in,res);
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
_Matrix.Mdag(res,out); // out = Mdag res
}
};
template<class Field> class HPDSolver : public LinearFunction<Field> { template<class Field> class HPDSolver : public LinearFunction<Field> {
private: private:
LinearOperatorBase<Field> & _Matrix; LinearOperatorBase<Field> & _Matrix;

View File

@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;
auto src_n = src; auto src_n = src;
auto tmp = src; auto tmp = src;
const int _MAX_ITER_EST_ = 100; const int _MAX_ITER_EST_ = 200;
for (int i=0;i<_MAX_ITER_EST_;i++) { for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -30,18 +30,17 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl; std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { // if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na; // evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; // return evalMaxApprox;
return evalMaxApprox; // }
}
evalMaxApprox = na; evalMaxApprox = na;
src_n = tmp; src_n = tmp;
} }
assert(0); std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return 0; return evalMaxApprox;
} }
}; };
} }

View File

@@ -0,0 +1,76 @@
#pragma once
namespace Grid {
class Band
{
RealD lo, hi;
public:
Band(RealD _lo,RealD _hi)
{
lo=_lo;
hi=_hi;
}
RealD operator() (RealD x){
if ( x>lo && x<hi ){
return 1.0;
} else {
return 0.0;
}
}
};
class PowerSpectrum
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
std::vector<RealD> ranges;
std::vector<int> order;
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
template<class Field>
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
int N=ranges.size();
RealD hi = ranges[N-1];
RealD lo_band = 0.0;
RealD hi_band;
RealD nn=norm2(src);
RealD ss=0.0;
Field tmp = src;
for(int b=0;b<N;b++){
hi_band = ranges[b];
Band Notch(lo_band,hi_band);
Chebyshev<Field> polynomial;
polynomial.Init(0.0,hi,order[b],Notch);
polynomial.JacksonSmooth();
polynomial(HermOp,src,tmp) ;
RealD p=norm2(tmp);
ss=ss+p;
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
lo_band=hi_band;
}
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
return 0;
};
};
}

View File

@@ -74,7 +74,7 @@ public:
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi=Zero(); // psi=Zero();
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
ssq=norm2(src); ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq; rsq=Tolerance*Tolerance*ssq;

View File

@@ -499,6 +499,87 @@ namespace Grid {
} }
}; };
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, left preconditioned by Mee^inv
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
//
// Solve:
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
//
// Old notation e<->o
//
// Left precon by Moo^-1
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv // Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta // ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta

View File

@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x) inline RealD AggregatePowerLaw(RealD x)
@@ -95,7 +97,7 @@ public:
RealD scale; RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false); ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid); FineField noise(FineGrid);
FineField Mn(FineGrid); FineField Mn(FineGrid);
@@ -108,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl; hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){ for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]); CG(hermop,noise,subspace[b]);
@@ -124,6 +126,53 @@ public:
} }
} }
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
subspace[b] = guess;
#endif
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
subspace[b] = noise;
}
}
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found // and this is the best I found
@@ -160,14 +209,21 @@ public:
int b =0; int b =0;
{ {
ComplexD ip;
// Filter // Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter); Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn); Cheb(hermop,noise,Mn);
// normalise // normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn; subspace[b] = Mn;
hermop.Op(Mn,tmp); hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++; b++;
} }
@@ -213,8 +269,18 @@ public:
Mn=*Tnp; Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn; subspace[b] = Mn;
ComplexD ip;
hermop.Op(Mn,tmp); hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++; b++;
} }
@@ -228,6 +294,70 @@ public:
} }
assert(b==nn); assert(b==nn);
} }
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn);
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
// Generate a full sequence of Chebyshevs
for(int n=1;n<nn;n++){
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
Cheb(hermop,subspace[n-1],Mn);
for(int m=0;m<n;m++){
ComplexD c = innerProduct(subspace[m],Mn);
Mn = Mn - c*subspace[m];
}
// normalise
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[n]=Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop, virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn, int nn,
double hi, double hi,

View File

@@ -99,7 +99,7 @@ public:
CoarseMatrix AselfInvEven; CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd; CoarseMatrix AselfInvOdd;
Vector<RealD> dag_factor; deviceVector<RealD> dag_factor;
/////////////////////// ///////////////////////
// Interface // Interface
@@ -124,9 +124,13 @@ public:
int npoint = geom.npoint; int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -161,7 +165,7 @@ public:
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}; };
void Mdag (const CoarseVector &in, CoarseVector &out) void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -190,9 +194,14 @@ public:
int npoint = geom.npoint; int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -201,10 +210,10 @@ public:
int osites=Grid()->oSites(); int osites=Grid()->oSites();
Vector<int> points(geom.npoint, 0); deviceVector<int> points(geom.npoint);
for(int p=0; p<geom.npoint; p++) for(int p=0; p<geom.npoint; p++) {
points[p] = geom.points_dagger[p]; acceleratorPut(points[p],geom.points_dagger[p]);
}
auto points_p = &points[0]; auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0]; RealD* dag_factor_p = &dag_factor[0];
@@ -236,7 +245,7 @@ public:
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
} }
void MdirComms(const CoarseVector &in) void MdirComms(const CoarseVector &in)
@@ -251,8 +260,14 @@ public:
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite); autoView( out_v , out, AcceleratorWrite);
@@ -285,7 +300,7 @@ public:
} }
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
} }
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{ {
@@ -469,14 +484,20 @@ public:
// determine in what order we need the points // determine in what order we need the points
int npoint = geom.npoint-1; int npoint = geom.npoint-1;
Vector<int> points(npoint, 0); deviceVector<int> points(npoint);
for(int p=0; p<npoint; p++) for(int p=0; p<npoint; p++) {
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p; int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
auto points_p = &points[0]; auto points_p = &points[0];
Vector<Aview> AcceleratorViewContainer; deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead)); hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -539,7 +560,7 @@ public:
}); });
} }
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
} }
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) : CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@@ -590,11 +611,13 @@ public:
} }
// GPU readable prefactor // GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, { thread_for(i, nbasis*nbasis, {
int j = i/nbasis; int j = i/nbasis;
int k = i%nbasis; int k = i%nbasis;
dag_factor[i] = dag_factor_eigen(j, k); h_dag_factor[i] = dag_factor_eigen(j, k);
}); });
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
} }
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,

View File

@@ -441,8 +441,20 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl; std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
} }
#else #else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace) Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{ {
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl; std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid(); GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
// Orthogonalise the subblocks over the basis // Orthogonalise the subblocks over the basis
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid()); CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace); blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint; const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions(); Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl; std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond(); tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i]; phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond(); tphaseBZ+=usecond();
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl; // std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond(); tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace); blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner; coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner; ComputeProj[p] = coarseInner;

View File

@@ -54,6 +54,9 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -66,7 +69,7 @@ public:
} }
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);}; void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
@@ -100,6 +103,9 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -145,6 +151,9 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -165,19 +174,10 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef ACCELERATOR_CSHIFT template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
// Cshift on device template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using cshiftAllocator = devAllocator<T>; template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
#else template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
/* /*
template<class T> class vecView template<class T> class vecView
@@ -188,8 +188,9 @@ template<class T> class vecView
ViewMode mode; ViewMode mode;
void * cpu_ptr; void * cpu_ptr;
public: public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(std::vector<T> &refer_to_me,ViewMode _mode) vecView(Vector<T> &refer_to_me,ViewMode _mode)
{ {
cpu_ptr = &refer_to_me[0]; cpu_ptr = &refer_to_me[0];
size = refer_to_me.size(); size = refer_to_me.size();
@@ -205,22 +206,12 @@ template<class T> class vecView
} }
}; };
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode) template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
{ {
vecView<T> ret(vec,_mode); // does the open vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed return ret; // must be closed
} }
// Little autoscope assister
template<class View>
class VectorViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
VectorViewCloser(View &_v) : v(_v) {};
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
};
#define autoVecView(v_v,v,mode) \ #define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \ auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v); ViewCloser<decltype(v_v)> _autoView##v_v(v_v);

View File

@@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
uint64_t total_shared; uint64_t total_shared;
uint64_t total_device; uint64_t total_device;
uint64_t total_host;; uint64_t total_host;;
#if defined(__has_feature)
#if __has_feature(leak_sanitizer)
#define ASAN_LEAK_CHECK
#endif
#endif
#ifdef ASAN_LEAK_CHECK
#include <sanitizer/asan_interface.h>
#include <sanitizer/common_interface_defs.h>
#include <sanitizer/lsan_interface.h>
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
#else
#define LEAK_CHECK(A) { }
#endif
void MemoryManager::DisplayMallinfo(void)
{
#ifdef __linux__
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
mi = mallinfo();
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
#endif
LEAK_CHECK();
}
void MemoryManager::PrintBytes(void) void MemoryManager::PrintBytes(void)
{ {
std::cout << " MemoryManager : ------------------------------------ "<<std::endl; std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
#ifdef GRID_CUDA #ifdef GRID_CUDA
cuda_mem(); cuda_mem();
#endif #endif
DisplayMallinfo();
} }
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }

View File

@@ -211,6 +211,7 @@ private:
#endif #endif
public: public:
static void DisplayMallinfo(void);
static void NotifyDeletion(void * CpuPtr); static void NotifyDeletion(void * CpuPtr);
static void Print(void); static void Print(void);
static void PrintAll(void); static void PrintAll(void);

View File

@@ -1,16 +1,15 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#ifndef GRID_UVM #ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define MAXLINE 512 #define MAXLINE 512
static char print_buffer [ MAXLINE ]; static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer; #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...) //#define dprintf(...)
//#define mprintf(...)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// For caching copies of data on device // For caching copies of data on device
@@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty); assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0); assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
LRUremove(AccCache); LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL; AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
} }
uint64_t CpuPtr = AccCache.CpuPtr; uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr); EntryErase(CpuPtr);
@@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty); assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n", mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return; if (AccCache.accLock!=0) return;
@@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)NULL; AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
} }
// uint64_t CpuPtr = AccCache.CpuPtr; // uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++; DeviceEvictions++;
@@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes; DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++; DeviceToHostXfer++;
AccCache.state=Consistent; AccCache.state=Consistent;
@@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes; DeviceBytes+=AccCache.bytes;
} }
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes; HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++; HostToDeviceXfer++;
@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode) void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{ {
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr); CpuViewClose((uint64_t)Ptr);
@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{ {
uint64_t CpuPtr = (uint64_t)_CpuPtr; uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
} }
void MemoryManager::EvictVictims(uint64_t bytes) void MemoryManager::EvictVictims(uint64_t bytes)
{ {
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes); assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){ while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){ if ( DeviceLRUBytes > 0){
@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr, (uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr, (uint64_t)CpuPtr,
(uint64_t)AccCache.bytes, (uint64_t)AccCache.bytes,
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent AccCache.state = Consistent; // Empty + AccRead => Consistent
} }
AccCache.accLock= 1; AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){ } else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) { if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache); CpuDiscard(AccCache);
@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
} }
AccCache.accLock++; AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) { } else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else else
AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++; AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) { } else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++; AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else { } else {
assert(0); assert(0);
} }
@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// If view is opened on device must remove from LRU // If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){ if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU // must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n"); dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache); LRUremove(AccCache);
} }
@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
AccCache.accLock--; AccCache.accLock--;
// Move to LRU queue if not locked and close on device // Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) { if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache); LRUinsert(AccCache);
} else { } else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
} }
} }
void MemoryManager::CpuViewClose(uint64_t CpuPtr) void MemoryManager::CpuViewClose(uint64_t CpuPtr)

View File

@@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
uint64_t virt_pfn = (uint64_t)Buf / page_size; uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn; off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size; uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages]; std::vector<uint64_t> pagedata(npages);
uint64_t ret = lseek(fd, offset, SEEK_SET); uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset); assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages); assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512; int nhugepages = npages / 512;
int n4ktotal, nnothuge; int n4ktotal, nnothuge;

View File

@@ -31,5 +31,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cartesian/Cartesian_base.h> #include <Grid/cartesian/Cartesian_base.h>
#include <Grid/cartesian/Cartesian_full.h> #include <Grid/cartesian/Cartesian_full.h>
#include <Grid/cartesian/Cartesian_red_black.h> #include <Grid/cartesian/Cartesian_red_black.h>
#include <Grid/cartesian/CartesianCrossIcosahedron.h>
#endif #endif

View File

@@ -0,0 +1,207 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/CartesianCrossIcosahedron.h
Copyright (C) 2025
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
enum IcosahedralMeshType {
IcosahedralVertices,
IcosahedralEdges
} ;
enum NorthSouth {
North = 1,
South = 0
};
const int IcosahedralPatches = 10;
const int HemiPatches=IcosahedralPatches/2;
const int NorthernHemisphere = HemiPatches;
const int SouthernHemisphere = 0;
class GridCartesianCrossIcosahedron: public GridCartesian {
public:
IcosahedralMeshType meshType;
IcosahedralMeshType MeshType(void) { return meshType; };
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
/*
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{
assert(0); // No subdivision
}
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{
assert(0); // No subdivision
}
*/
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesianCrossIcosahedron(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
IcosahedralMeshType _meshType) : GridCartesian(dimensions,simd_layout,processor_grid)
{
meshType = _meshType;
Coordinate S2dimensions=dimensions;
Coordinate S2simd =simd_layout;
Coordinate S2procs =processor_grid;
assert(simd_layout[0]==1); // Force simd into perpendicular dimensions
assert(simd_layout[1]==1); // to avoid pole storage complexity interacting with SIMD.
assert(dimensions[_ndimension-1]==IcosahedralPatches);
assert(processor_grid[_ndimension-1]<=2); // Keeps the patches that need a pole on the same node
// Save a copy of the basic cartesian initialisation volume
cartesianOsites = this->_osites;
// allocate the pole storage if we are seeking vertex domain data
if ( meshType == IcosahedralVertices ) {
InitPoles();
}
}
virtual ~GridCartesianCrossIcosahedron() = default;
////////////////////////////////////////////////
// Use to decide if a given grid is icosahedral
////////////////////////////////////////////////
int hasNorthPole;
int hasSouthPole;
int northPoleOsite;
int southPoleOsite;
int northPoleOsites;
int southPoleOsites;
int cartesianOsites;
virtual int isIcosahedral(void) override { return 1;}
virtual int isIcosahedralVertex(void) override { return meshType==IcosahedralVertices;}
virtual int isIcosahedralEdge (void) override { return meshType==IcosahedralEdges;}
virtual int NorthPoleOsite(void) const override { return northPoleOsite; };
virtual int NorthPoleOsites(void) const override { return northPoleOsites; };
virtual int SouthPoleOsite(void) const override { return southPoleOsite; };
virtual int SouthPoleOsites(void) const override { return southPoleOsites; };
virtual int ownsNorthPole(void) const override { return hasNorthPole; };
virtual int ownsSouthPole(void) const override { return hasSouthPole; };
virtual int CartesianOsites(void) const override { return cartesianOsites; };
void InitPoles(void)
{
int Ndm1 = _ndimension-1;
///////////////////////
// Add the extra pole storage
///////////////////////
// Vertices = 1x LxLx D1...Dn + 2.D1...Dn
// Start after the LxL and don't include the 10 patch dim
int OrthogSize = 1;
for (int d = 2; d < Ndm1; d++) {
OrthogSize *= _gdimensions[d];
}
_fsites += OrthogSize*2;
_gsites += OrthogSize*2;
// Simd reduced sizes are multiplied up.
// If the leading LxL are simd-ized, the vector objects will contain "redundant" lanes
// which should contain identical north (south) pole data
OrthogSize = 1;
for (int d = 2; d < Ndm1; d++) {
OrthogSize *= _rdimensions[d];
}
// Grow the local volume to hold pole data
// on rank (0,0) in the LxL planes
// since SIMD must be placed in the orthogonal directions
Coordinate pcoor = this->ThisProcessorCoor();
Coordinate pgrid = this->ProcessorGrid();
const int xdim=0;
const int ydim=1;
/*
*
* /\/\/\/\/\
* /\/\/\/\/\/
* \/\/\/\/\/
*
* y
* /
* \x
*
* Labelling patches as 5 6 7 8 9
* 0 1 2 3 4
*
* Will ban distribution of the patch dimension by more than 2.
*
* Hence all 5 patches associated with the pole must have the
* appropriate "corner" of the patch L^2 located on the SAME rank.
*/
if( (pcoor[xdim]==pgrid[xdim]-1) && (pcoor[ydim]==0) && (pcoor[Ndm1]==0) ){
hasSouthPole =1;
southPoleOsite=this->_osites;
southPoleOsites=OrthogSize;
this->_osites += OrthogSize;
} else {
hasSouthPole =0;
southPoleOsites=0;
southPoleOsite=0;
}
if( (pcoor[xdim]==0) && (pcoor[ydim]==pgrid[ydim]-1) && (pcoor[Ndm1]==pgrid[Ndm1]-1) ){
hasNorthPole =1;
northPoleOsite=this->_osites;
northPoleOsites=OrthogSize;
this->_osites += OrthogSize;
} else {
hasNorthPole =0;
northPoleOsites=0;
northPoleOsite=0;
}
std::cout << "Icosahedral vertex field volume " << this->_osites<<std::endl;
std::cout << "Icosahedral south pole offset " << this->southPoleOsite<<std::endl;
std::cout << "Icosahedral north pole offset " << this->northPoleOsite<<std::endl;
std::cout << "Icosahedral south pole size " << this->southPoleOsites<<std::endl;
std::cout << "Icosahedral north pole size " << this->northPoleOsites<<std::endl;
};
};
NAMESPACE_END(Grid);

View File

@@ -82,13 +82,27 @@ public:
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic; int LocallyPeriodic;
Coordinate _checker_dim_mask; Coordinate _checker_dim_mask;
int _checker_dim;
public: public:
// Icosahedral decisions
virtual int isIcosahedral(void) { return 0;}
virtual int isIcosahedralVertex(void) { return 0;}
virtual int isIcosahedralEdge (void) { return 0;}
virtual int ownsNorthPole(void) const { return 0; };
virtual int ownsSouthPole(void) const { return 0; };
virtual int NorthPoleOsite(void) const { return 0; };
virtual int SouthPoleOsite(void) const { return 0; };
virtual int NorthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
virtual int SouthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
virtual int CartesianOsites(void) const { return this->oSites(); };
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by // Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian // GridCartesian / GridRedBlackCartesian
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim) =0; virtual int CheckerBoarded(int dim) =0;
virtual int CheckerBoard(const Coordinate &site)=0; virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
@@ -175,6 +189,8 @@ public:
} }
return permute_type; return permute_type;
} }
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Array sizing queries // Array sizing queries
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////

View File

@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
@@ -106,6 +106,7 @@ public:
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);; _checker_dim_mask.resize(_ndimension);;
_checker_dim = -1;
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);

View File

@@ -57,9 +57,10 @@ class GridRedBlackCartesian : public GridBase
{ {
public: public:
// Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
int _checker_dim; // int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;
virtual int isCheckerBoarded(void) const { return 1; };
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;

View File

@@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c) void CartesianCommunicator::GlobalSum(ComplexF &c)
{ {
GlobalSumVector((float *)&c,2); GlobalSumVector((float *)&c,2);
} }
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c) void CartesianCommunicator::GlobalSum(ComplexD &c)
{ {
GlobalSumVector((double *)&c,2); GlobalSumVector((double *)&c,2);
} }
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{ {
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);

View File

@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ; extern bool Stencil_force_mpi ;
@@ -128,6 +130,35 @@ public:
void GlobalXOR(uint32_t &); void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &); void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){ template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type; typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type); int words = sizeof(obj)/sizeof(scalar_type);
@@ -138,8 +169,8 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way // Face exchange, buffer swap in translational invariant way
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void CommsComplete(std::vector<CommsRequest_t> &list); void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
@@ -158,6 +189,17 @@ public:
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
int bytes,int dir); int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,

View File

@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
//////////////////////////////////////////// ////////////////////////////////////////////
@@ -257,15 +258,41 @@ CartesianCommunicator::~CartesianCommunicator()
} }
} }
} }
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){ void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(uint64_t &u){ void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){ void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
@@ -287,27 +314,18 @@ void CartesianCommunicator::GlobalMax(double &d)
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N) void CartesianCommunicator::GlobalSumVector(float *f,int N)
{ {
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N) void CartesianCommunicator::GlobalSumVector(double *d,int N)
{ {
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
@@ -332,7 +350,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list) void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{ {
int nreq=list.size(); int nreq=list.size();
@@ -351,9 +369,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from, int from,
int bytes) int bytes)
{ {
std::vector<CommsRequest_t> reqs(0); std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
@@ -369,9 +385,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
assert(ierr==0); assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
} }
// Basic Halo comms primitive // Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@@ -381,12 +394,25 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int bytes,int dir) int bytes,int dir)
{ {
std::vector<CommsRequest_t> list; std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir); StencilSendToRecvFromComplete(list,dir);
return offbytes; return offbytes;
} }
#undef NVLINK_GET // Define to use get instead of put DMA
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest,int dox, int dest,int dox,
@@ -420,14 +446,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
off_node_bytes+=rbytes; off_node_bytes+=rbytes;
} }
#ifdef NVLINK_GET #ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit); void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif #endif
} }
// This is a NVLINK PUT
if (dox) { if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32; tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -440,27 +467,341 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif #endif
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
} }
} }
return off_node_bytes; return off_node_bytes;
} }
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
int nreq=list.size(); acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
acceleratorCopySynchronise(); std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
if (nreq==0) return; for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
std::vector<MPI_Status> status(nreq); if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); // if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
assert(ierr==0);
list.resize(0);
} }
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void) void CartesianCommunicator::StencilBarrier(void)
{ {
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm); MPI_Barrier (ShmComm);
} }
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -468,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
//} //}
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void)
{ {
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator); int ierr = MPI_Barrier(communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{ {
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data, int ierr=MPI_Bcast(data,
bytes, bytes,
MPI_BYTE, MPI_BYTE,
@@ -491,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
} }
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{ {
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data, int ierr= MPI_Bcast(data,
bytes, bytes,
MPI_BYTE, MPI_BYTE,
@@ -513,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
} }
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{ {
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments // MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data. // 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

View File

@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{ {
assert(0); assert(0);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);} void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
@@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{ {
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,

View File

@@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t; typedef int CommsRequest_t;
typedef int Grid_MPI_Comm; typedef int Grid_MPI_Comm;
#endif #endif
@@ -105,7 +137,7 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes); // static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };

View File

@@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef ACCELERATOR_AWARE_MPI #ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC #define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS #define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif #endif
#include <syscall.h> #include <syscall.h>
#endif #endif
@@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI #ifndef ACCELERATOR_AWARE_MPI
HostCommBuf= malloc(bytes); // printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif #endif
ShmCommBuf = acceleratorAllocDevice(bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
@@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle; ze_ipc_mem_handle_t ihandle;
clone_mem_t handle; clone_mem_t handle;
@@ -880,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) //void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{ //{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) //#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes); // acceleratorCopyToDevice(src,dest,bytes);
#else //#else
bcopy(src,dest,bytes); // bcopy(src,dest,bytes);
#endif //#endif
} //}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
@@ -923,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@@ -975,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode; check[0]=GlobalSharedMemory::WorldNode;
check[1]=r; check[1]=r;
check[2]=magic; check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t)); acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier(); acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode); assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r); assert(check[1]==r);
assert(check[2]==magic); assert(check[2]==magic);
ShmBarrier();
} }
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
} }
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)

View File

@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{ {
acceleratorMemSet(dest,0,bytes); acceleratorMemSet(dest,0,bytes);
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) //void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{ //{
acceleratorCopyToDevice(src,dest,bytes); // acceleratorCopyToDevice(src,dest,bytes);
} //}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality

View File

@@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{ {

View File

@@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern std::vector<std::pair<int,int> > Cshift_table; extern std::vector<std::pair<int,int> > Cshift_table;
extern commVector<std::pair<int,int> > Cshift_table_device; extern deviceVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void) inline std::pair<int,int> *MapCshiftTable(void)
{ {
// GPU version // GPU version
#ifdef ACCELERATOR_CSHIFT
uint64_t sz=Cshift_table.size(); uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) { if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz); Cshift_table_device.resize(sz);
@@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
sizeof(Cshift_table[0])*sz); sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0]; return &Cshift_table_device[0];
#else
return &Cshift_table[0];
#endif
// CPU version use identify map // CPU version use identify map
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
{ {
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){ if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else { } else {
Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask; Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
} }
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
{ {
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite); autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
}); });
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
} }
} }
@@ -278,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite); autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@@ -287,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block; int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}); });
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{ {
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite); autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
{ {
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead); autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite); autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
}); });
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
} }
} }

View File

@@ -31,9 +31,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
assert(!rhs.Grid()->isIcosahedral());
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
@@ -65,7 +67,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
Cshift_comms(ret,rhs,dimension,shift); Cshift_comms(ret,rhs,dimension,shift);
} }
t1=usecond(); t1=usecond();
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret; return ret;
} }
@@ -94,7 +96,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; // std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl; // std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3); Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
@@ -104,8 +106,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -125,8 +125,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size); static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size); static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -158,18 +162,31 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
// int rank = grid->_processor; // int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
// grid->Barrier(); grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes; xbytes+=bytes;
// grid->Barrier(); grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
tscatter-=usecond(); tscatter-=usecond();
@@ -177,13 +194,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tscatter+=usecond(); tscatter+=usecond();
} }
} }
/* if (Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/ }
} }
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -224,8 +241,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd); static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd); static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi; scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi; scalar_object * send_buf_extract_mpi;
@@ -233,6 +250,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract[s].resize(buffer_size); send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size); recv_buf_extract[s].resize(buffer_size);
} }
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
@@ -281,246 +302,31 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
// grid->Barrier(); grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi, grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)recv_buf_extract_mpi, (void *)recv_buf_extract_mpi,
recv_from_rank, recv_from_rank,
bytes); bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
#else #else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) // bouncy bouncy
{ acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
typedef typename vobj::vector_type vector_type; grid->SendToRecvFrom((void *)&hsend_buf[0],
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&hrecv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
xbytes+=bytes; acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); #endif
// grid->Barrier(); xbytes+=bytes;
grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
} else { } else {
rpointers[i] = &send_buf_extract[nbr_lane][0]; rpointers[i] = &send_buf_extract[nbr_lane][0];
@@ -530,17 +336,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
tscatter-=usecond(); tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond(); tscatter+=usecond();
} }
/* if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
} }
#endif }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
assert(!rhs.Grid()->isIcosahedral());
Lattice<vobj> ret(rhs.Grid()); Lattice<vobj> ret(rhs.Grid());
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
Cshift_local(ret,rhs,dimension,shift); Cshift_local(ret,rhs,dimension,shift);

View File

@@ -1,5 +1,5 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
std::vector<std::pair<int,int> > Cshift_table; std::vector<std::pair<int,int> > Cshift_table;
commVector<std::pair<int,int> > Cshift_table_device; deviceVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
}); });
} }
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpy_norm"); GRID_TRACE("axpy_norm");
#ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y); return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
} }
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpby_norm"); GRID_TRACE("axpby_norm");
#ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y); return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
} }
/// Trace product /// Trace product

View File

@@ -237,16 +237,19 @@ public:
vobj vtmp; vobj vtmp;
vtmp = r; vtmp = r;
#if 1 #if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(*vvtmp_p);
coalescedWrite(me[ss],stmp);
});
#else
auto me = View(CpuWrite); auto me = View(CpuWrite);
thread_for(ss,me.size(),{ thread_for(ss,me.size(),{
me[ss]= r; me[ss]= r;
}); });
#else
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(vtmp);
coalescedWrite(me[ss],stmp);
});
#endif #endif
me.ViewClose(); me.ViewClose();
return *this; return *this;
@@ -370,14 +373,17 @@ public:
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
for(int64_t g=0;g<o.Grid()->_gsites;g++){ uint64_t gsites=1;
uint64_t polesites=0;
for(int d=0;d<o.Grid()->_ndimension;d++) gsites *= o.Grid()->_gdimensions[d];
for(int64_t g=0;g<gsites;g++){
Coordinate gcoor; Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor); o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
sobj ss; sobj ss;
peekSite(ss,o,gcoor); peekSite(ss,o,gcoor);
stream<<"["; stream<<"["<< g<<" : ";
for(int d=0;d<gcoor.size();d++){ for(int d=0;d<gcoor.size();d++){
stream<<gcoor[d]; stream<<gcoor[d];
if(d!=gcoor.size()-1) stream<<","; if(d!=gcoor.size()-1) stream<<",";
@@ -385,6 +391,41 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Latti
stream<<"]\t"; stream<<"]\t";
stream<<ss<<std::endl; stream<<ss<<std::endl;
} }
if ( o.Grid()->isIcosahedralVertex() ) {
uint64_t psites=1;
Coordinate perpdims;
for(int d=2;d<o.Grid()->_ndimension-1;d++){
int pd=o.Grid()->_gdimensions[d];
psites*=pd;
perpdims.push_back(pd);
}
for(uint64_t p=0;p<psites;p++){
sobj ss;
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
peekPole(ss,o,orthog,South);
stream<<"[ SouthPole : ";
for(int d=0;d<orthog.size();d++){
stream<<orthog[d];
if(d!=orthog.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
for(uint64_t p=0;p<psites;p++){
sobj ss;
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
peekPole(ss,o,orthog,North);
stream<<"[ NorthPole : ";
for(int d=0;d<orthog.size();d++){
stream<<orthog[d];
if(d!=orthog.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
}
return stream; return stream;
} }

View File

@@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field; typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View; typedef decltype(basis[0].View(AcceleratorRead)) View;
Vector<View> basis_v; basis_v.reserve(basis.size()); hostVector<View> h_basis_v(basis.size());
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj; deviceVector<View> d_basis_v(basis.size());
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t; typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorWrite)); h_basis_v[k] = basis[k].View(AcceleratorWrite);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
} }
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) ) View *basis_vp = &d_basis_v[0];
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
int nrot = j1-j0; int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda if (!nrot) // edge case not handled gracefully by Cuda
@@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites(); uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
Vector <vobj> Bt(siteBlock * nrot); deviceVector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0]; auto Bp=&Bt[0];
// GPU readable copy of matrix // GPU readable copy of matrix
Vector<Coeff_t> Qt_jv(Nm*Nm); hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0]; Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{ thread_for(i,Nm*Nm,{
int j = i/Nm; int j = i/Nm;
int k = i%Nm; int k = i%Nm;
Qt_p[i]=Qt(j,k); h_Qt_jv[i]=Qt(j,k);
}); });
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down // Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){ for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j])); coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
}); });
} }
#endif
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose(); for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
} }
// Extract a single rotated vector // Extract a single rotated vector
@@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard(); result.Checkerboard() = basis[0].Checkerboard();
Vector<View> basis_v; basis_v.reserve(basis.size()); hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorRead)); h_basis_v[k]=basis[k].View(AcceleratorRead);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
} }
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0]; vobj zz=Zero();
deviceVector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
auto basis_vp=& d_basis_v[0];
autoView(result_v,result,AcceleratorWrite); autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero(); vobj zzz=Zero();
@@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
} }
coalescedWrite(result_v[ss], B); coalescedWrite(result_v[ss], B);
}); });
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose(); for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
} }
template<class Field> template<class Field>

View File

@@ -34,11 +34,18 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
typedef typename iobj::scalar_type scalar_type; typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type; typedef typename iobj::vector_type vector_type;
l=Zero();
GridBase *grid = l.Grid(); GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
int cartesian_vol = grid->oSites();
if ( grid->isIcosahedral() ) {
cartesian_vol = cartesian_vol - grid->NorthPoleOsites()-grid->SouthPoleOsites();
}
{
autoView(l_v, l, CpuWrite); autoView(l_v, l, CpuWrite);
thread_for( o, grid->oSites(), { thread_for( o, cartesian_vol, {
vector_type vI; vector_type vI;
Coordinate gcoor; Coordinate gcoor;
ExtractBuffer<scalar_type> mergebuf(Nsimd); ExtractBuffer<scalar_type> mergebuf(Nsimd);
@@ -49,7 +56,64 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
merge<vector_type,scalar_type>(vI,mergebuf); merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI; l_v[o]=vI;
}); });
}
if (grid->isIcosahedralVertex()) {
uint64_t psites=1;
Coordinate perpdims;
typename iobj::scalar_object ss;
for(int d=2;d<grid->_ndimension-1;d++){
int pd=grid->_gdimensions[d];
psites*=pd;
perpdims.push_back(pd);
}
for(uint64_t p=0;p<psites;p++){
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
int icoor;
if ( mu>=2 && mu < grid->_ndimension-1) {
icoor = orthog[mu-2];
} else {
icoor = -1;
}
ss=scalar_type(icoor);
pokePole(ss,l,orthog,South);
pokePole(ss,l,orthog,North);
}
}
};
template<class iobj> inline void LatticePole(Lattice<iobj> &l,NorthSouth pole)
{
typedef typename iobj::scalar_object sobj;
typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type;
GridBase *grid = l.Grid();
l=Zero();
assert(grid->isIcosahedralVertex());
if (grid->isIcosahedralVertex()) {
uint64_t psites=1;
Coordinate perpdims;
sobj ss;
scalar_type one(1.0);
ss=one;
for(int d=2;d<l.Grid()->_ndimension-1;d++){
int pd=l.Grid()->_gdimensions[d];
psites*=pd;
perpdims.push_back(pd);
}
for(uint64_t p=0;p<psites;p++){
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
pokePole(ss,l,orthog,pole);
}
}
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite); autoView( l_v , l, CpuRead);
extract(l_v[odx],buf); extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@@ -151,6 +151,261 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
return; return;
}; };
// zero for south pole, one for north pole
template<class vobj,class sobj>
void peekPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
s=Zero();
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
Coordinate pgrid = grid->ProcessorGrid();
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
Coordinate pcoor(grid->_ndimension);
for(int d=2;d<Ndm1;d++){
int dd=d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
if(isNorth == North){
pcoor[xdim] = 0;
pcoor[ydim] = pgrid[ydim]-1;
pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
} else {
pcoor[xdim] = pgrid[xdim]-1;
pcoor[ydim] = 0;
pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
}
rank = grid->RankFromProcessorCoor(pcoor);
if ( rank == grid->ThisRank() ) {
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
s = buf[pole_isite];
}
grid->Broadcast(rank,s);
return;
};
template<class vobj,class sobj>
void pokePole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
grid->Broadcast(grid->BossRank(),s);
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
Coordinate pgrid = grid->ProcessorGrid();
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
Coordinate pcoor(grid->_ndimension,0);
for(int d=2;d<Ndm1;d++){
int dd = d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
int o = orthog[dd];
int r = grid->_rdimensions[d];
int omr = o % r;
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
if(isNorth ==North){
pcoor[xdim] = 0;
pcoor[ydim] = pgrid[ydim]-1;
pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
} else {
pcoor[xdim] = pgrid[xdim]-1;
pcoor[ydim] = 0;
pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
}
rank = grid->RankFromProcessorCoor(pcoor);
// extract-modify-merge cycle is easiest way and this is not perf critical
if ( rank == grid->ThisRank() ) {
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
buf[pole_isite] = s;
merge(l_v[osite],buf);
}
return;
};
template<class vobj,class sobj>
void peekLocalPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
s=Zero();
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
Coordinate pgrid = grid->ProcessorGrid();
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
// Coordinate pcoor(grid->_ndimension);
for(int d=2;d<Ndm1;d++){
int dd=d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
// pcoor[d] = orthog[dd]/grid->_ldimensions[d];
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
if(isNorth == North){
// pcoor[xdim] = 0;
// pcoor[ydim] = pgrid[ydim]-1;
// pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
assert(grid->ownsNorthPole());
} else {
// pcoor[xdim] = pgrid[xdim]-1;
// pcoor[ydim] = 0;
// pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
assert(grid->ownsSouthPole());
}
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
s = buf[pole_isite];
return;
};
template<class vobj,class sobj>
void pokeLocalPole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
// Coordinate pcoor(grid->_ndimension,0);
for(int d=2;d<Ndm1;d++){
int dd = d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
// pcoor[d] = orthog[dd]/grid->_ldimensions[d];
int o = orthog[dd];
int r = grid->_rdimensions[d];
int omr = o % r;
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
int insert=0;
if(isNorth ==North){
// pcoor[xdim] = 0;
// pcoor[ydim] = pgrid[ydim]-1;
// pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
assert(grid->ownsNorthPole());
} else {
// pcoor[xdim] = pgrid[xdim]-1;
// pcoor[ydim] = 0;
// pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
assert(grid->ownsSouthPole());
}
// extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
buf[pole_isite] = s;
merge(l_v[osite],buf);
return;
};
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
@@ -165,7 +420,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site)); // assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -202,7 +457,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site)); // assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);

View File

@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
// const int Nsimd = vobj::Nsimd(); // const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); std::vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
sumarray[i]=Zero(); sumarray[i]=Zero();
} }
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); std::vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
sumarray[i]=Zero(); sumarray[i]=Zero();
} }
@@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
// Might make all code paths go this way. // Might make all code paths go this way.
#if 0
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
});
}
#else
typedef decltype(innerProduct(vobj(),vobj())) inner_t; typedef decltype(innerProduct(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites); deviceVector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
{ {
@@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
}); });
} }
#endif
// This is in single precision and fails some tests // This is in single precision and fails some tests
auto anrm = sumD(inner_tmp_v,sites); auto anrm = sumD(inner_tmp_v,sites);
nrm = anrm; nrm = anrm;
@@ -307,8 +290,10 @@ template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL #ifdef GRID_SYCL
uint64_t csum=0; uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{ {
// Hack // Hack
@@ -317,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0]; uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words); csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
} else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
} }
FlightRecorder::CsumLog(csum);
#endif #endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right); ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm); RealD local = real(nrm);
FlightRecorder::NormLog(real(nrm)); ok = FlightRecorder::NormLog(real(nrm));
if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm)); FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@@ -360,20 +365,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead); autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead); autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite); autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, nsimd,{
@@ -381,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
}); });
nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// z_v
{
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&z_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// inner_v
{
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
}
#endif #endif
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm));
assert(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@@ -393,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
conformable(left,right); conformable(left,right);
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
Vector<ComplexD> tmp(2); std::vector<ComplexD> tmp(2);
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
@@ -403,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
// GPU // GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t; typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites); deviceVector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites); deviceVector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0]; auto norm_tmp_v = &norm_tmp[0];
{ {
@@ -454,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
std::vector<typename vobj::scalar_object> &result,
int orthogdim)
{ {
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// FIXME precision promoted summation // FIXME precision promoted summation
@@ -476,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
Vector<vobj> lvSum(rd); // will locally sum vectors first std::vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node result.resize(fd); // And then global sum to return the same vector to every node
@@ -525,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
scalar_type * ptr = (scalar_type *) &result[0]; scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type); int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words); grid->GlobalSumVector(ptr, words);
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
} }
template<class vobj> inline template<class vobj> inline
std::vector<typename vobj::scalar_object> std::vector<typename vobj::scalar_object>
@@ -535,7 +568,20 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
return result; return result;
} }
/*
Reimplement
1)
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
2)
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
3)
-- Make Slice Mul Matrix call sliceMaddMatrix
*/
template<class vobj> template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{ {
@@ -555,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
Vector<vector_type> lvSum(rd); // will locally sum vectors first std::vector<vector_type> lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@@ -686,203 +732,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
} }
}; };
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{ {
int NN = BlockSolverGrid->_ndimension; int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd(); int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0); std::vector<int> latt_phys(NN-1);
std::vector<int> simd_phys(0); Coordinate simd_phys;
std::vector<int> mpi_phys(0); std::vector<int> mpi_phys(NN-1);
Coordinate checker_dim_mask(NN-1);
int checker_dim=-1;
int dd;
for(int d=0;d<NN;d++){ for(int d=0;d<NN;d++){
if( d!=Orthog ) { if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]); latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); mpi_phys[dd] =BlockSolverGrid->_processors[d];
mpi_phys.push_back(BlockSolverGrid->_processors[d]); checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
dd++;
} }
} }
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
if(BlockSolverGrid->_isCheckerBoarded) {
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
delete tmp;
return (GridBase *) ret;
} else {
return (GridBase *) tmp;
}
} }
*/
template<class vobj> template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{ {
GridBase *FullGrid = X.Grid();
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Ys(SliceGrid);
Lattice<vobj> Rs(SliceGrid);
Lattice<vobj> Xs(SliceGrid);
Lattice<vobj> RR(FullGrid);
RR = R; // Copies checkerboard for insert
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
int Nblock = X.Grid()->GlobalDimensions()[Orthog]; for(int i=0;i<Nslice;i++){
ExtractSlice(Ys,Y,i,Orthog);
GridBase *FullGrid = X.Grid(); ExtractSlice(Rs,R,i,Orthog);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); Rs=Ys;
for(int j=0;j<Nslice;j++){
// Lattice<vobj> Xslice(SliceGrid); ExtractSlice(Xs,X,j,Orthog);
// Lattice<vobj> Rslice(SliceGrid); Rs = Rs + Xs*(scale*aa(j,i));
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( X_v, X, CpuRead);
autoView( Y_v, Y, CpuRead);
autoView( R_v, R, CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
} }
InsertSlice(Rs,RR,i,Orthog);
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
} }
R=RR; // Copy back handles arguments aliasing case
delete SliceGrid;
}; };
template<class vobj> template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{ {
typedef typename vobj::scalar_object sobj; R=Zero();
typedef typename vobj::vector_type vector_type; sliceMaddMatrix(R,aa,X,R,Orthog,scale);
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( R_v, R, CpuWrite);
autoView( X_v, X, CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
}; };
template<class vobj> template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{ {
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
Lattice<vobj> ls(SliceGrid);
Lattice<vobj> rs(SliceGrid);
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = lhs.Grid(); mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); for(int s=0;s<Nslice;s++){
ExtractSlice(ls,lhs,s,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog]; for(int ss=0;ss<Nslice;ss++){
ExtractSlice(rs,rhs,ss,Orthog);
// Lattice<vobj> Lslice(SliceGrid); mat(s,ss) = innerProduct(ls,rs);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}});
thread_critical
{
mat += mat_thread;
} }
} }
delete SliceGrid;
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
// Move out of UVM // Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream // Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise // as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER deviceVector<sobj> buffer(numBlocks);
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
sobj result; sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier(); accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
result = *buffer_v;
#endif
return result; return result;
} }
@@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
const int words = sizeof(vobj)/sizeof(vector); const int words = sizeof(vobj)/sizeof(vector);
Vector<vector> buffer(osites); deviceVector<vector> buffer(osites);
vector *dat = (vector *)lat; vector *dat = (vector *)lat;
vector *buf = &buffer[0]; vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0]; iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];

View File

@@ -4,32 +4,28 @@ NAMESPACE_BEGIN(Grid);
// Possibly promote to double and sum // Possibly promote to double and sum
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj> template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD; typedef typename vobj::scalar_objectD sobjD;
static Vector<sobj> mysum;
mysum.resize(1);
sobj *mysum_p = & mysum[0];
sobj identity; zeroit(identity); sobj identity; zeroit(identity);
sobj ret ; sobj ret; zeroit(ret);
Integer nsimd= vobj::Nsimd(); Integer nsimd= vobj::Nsimd();
{
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); sycl::buffer<sobj, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites}, cgh.parallel_for(sycl::range<1>{osites},
Reduction, Reduction,
[=] (cl::sycl::id<1> item, auto &sum) { [=] (sycl::id<1> item, auto &sum) {
auto osite = item[0]; auto osite = item[0];
sum +=Reduce(lat[osite]); sum +=Reduce(lat[osite]);
}); });
}); });
theGridAccelerator->wait(); }
ret = mysum[0];
// free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret); sobjD dret; convertType(dret,ret);
return dret; return dret;
} }
@@ -75,58 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
template<class Word> Word svm_xor(Word *vec,uint64_t L) template<class Word> Word svm_xor(Word *vec,uint64_t L)
{ {
Word xorResult; xorResult = 0;
static Vector<Word> d_sum;
d_sum.resize(1);
Word *d_sum_p=&d_sum[0];
Word identity; identity=0; Word identity; identity=0;
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); Word ret = 0;
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { {
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); sycl::buffer<Word, 1> abuff(&ret, {1});
cgh.parallel_for(cl::sycl::range<1>{L}, theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction, Reduction,
[=] (cl::sycl::id<1> index, auto &sum) { [=] (sycl::id<1> index, auto &sum) {
sum ^=vec[index]; sum ^=vec[index];
}); });
}); });
}
theGridAccelerator->wait(); theGridAccelerator->wait();
Word ret = d_sum[0];
// free(d_sum,*theGridAccelerator);
return ret; return ret;
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
/*
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/

View File

@@ -47,6 +47,19 @@ NAMESPACE_BEGIN(Grid);
// Allow the RNG state to be less dense than the fine grid // Allow the RNG state to be less dense than the fine grid
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
inline int RNGfillable(GridBase *coarse,GridBase *fine) inline int RNGfillable(GridBase *coarse,GridBase *fine)
{
if ( coarse == fine ) return 1;
if ( coarse->isIcosahedral()) assert(coarse->isIcosahedralEdge());
if ( fine->isIcosahedralVertex() && coarse->isIcosahedralEdge() ) {
assert(fine->Nd()==coarse->Nd());
for(int d=0;d<fine->Nd();d++){
assert(fine->LocalDimensions()[d] == coarse->LocalDimensions()[d]);
}
return 1;
}
{ {
int rngdims = coarse->_ndimension; int rngdims = coarse->_ndimension;
@@ -74,12 +87,26 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
} }
return multiplicity; return multiplicity;
} }
}
// merge of April 11 2017 // merge of April 11 2017
// this function is necessary for the LS vectorised field // this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine) inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{ {
if ( coarse == fine ) return 1;
if ( coarse->isIcosahedral()) assert(coarse->isIcosahedralEdge());
if ( fine->isIcosahedralVertex() && coarse->isIcosahedralEdge() ) {
assert(fine->Nd()==coarse->Nd());
for(int d=0;d<fine->Nd();d++){
assert(fine->LocalDimensions()[d] == coarse->LocalDimensions()[d]);
}
return 1;
}
int rngdims = coarse->_ndimension; int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
@@ -352,12 +379,12 @@ private:
public: public:
GridBase *Grid(void) const { return _grid; } GridBase *Grid(void) const { return _grid; }
int generator_idx(int os,int is) { int generator_idx(int os,int is) {
return is*_grid->oSites()+os; return (is*_grid->CartesianOsites()+os)%_grid->lSites(); // On the pole sites wrap back to normal generators; Icosahedral hack
} }
GridParallelRNG(GridBase *grid) : GridRNGbase() { GridParallelRNG(GridBase *grid) : GridRNGbase() {
_grid = grid; _grid = grid;
_vol =_grid->iSites()*_grid->oSites(); _vol =_grid->lSites();
_generators.resize(_vol); _generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
@@ -381,7 +408,7 @@ public:
int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int osites = _grid->CartesianOsites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity, except on Icosahedral
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
autoView(l_v, l, CpuWrite); autoView(l_v, l, CpuWrite);
@@ -403,7 +430,26 @@ public:
merge(l_v[sm], buf); merge(l_v[sm], buf);
} }
}); });
// });
/*
* Fill in the poles for an Icosahedral vertex mesh
*/
if (l.Grid()->isIcosahedralVertex()) {
int64_t pole_sites=l.Grid()->NorthPoleOsites()+l.Grid()->SouthPoleOsites();
int64_t pole_base =l.Grid()->CartesianOsites();
ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < pole_sites; m++) { // Draw from same generator multiplicity times
for (int si = 0; si < Nsimd; si++) {
int gdx = 0;
scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset();
for (int idx = 0; idx < words; idx++)
fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
}
merge(l_v[pole_base+m], buf);
}
}
_time_counter += usecond()- inner_time_counter; _time_counter += usecond()- inner_time_counter;
} }

View File

@@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP) #if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { template<class vobj>
inline void sliceSumReduction_cub_small(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
size_t subvol_size = e1*e2; size_t subvol_size = e1*e2;
commVector<vobj> reduction_buffer(rd*subvol_size); deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0]; auto rb_p = &reduction_buffer[0];
vobj zero_init; vobj zero_init;
zeroit(zero_init); zeroit(zero_init);
@@ -46,7 +55,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int))); d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device //copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -79,7 +88,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream); acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy //sync after copy
accelerator_barrier(); accelerator_barrier();
@@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
#if defined(GRID_SYCL) #if defined(GRID_SYCL)
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) template<class vobj>
inline void sliceSumReduction_sycl_small(const vobj *Data,
std::vector <vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
size_t subvol_size = e1*e2; size_t subvol_size = e1*e2;
@@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
mysum[r] = vobj_zero; mysum[r] = vobj_zero;
} }
commVector<vobj> reduction_buffer(rd*subvol_size); deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0]; auto rb_p = &reduction_buffer[0];
@@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
}); });
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{subvol_size}, cgh.parallel_for(sycl::range<1>{subvol_size},
Reduction, Reduction,
[=](cl::sycl::id<1> item, auto &sum) { [=](sycl::id<1> item, auto &sum) {
auto s = item[0]; auto s = item[0];
sum += rb_p[r*subvol_size+s]; sum += rb_p[r*subvol_size+s];
}); });
@@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
} }
#endif #endif
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { template<class vobj>
inline void sliceSumReduction_large(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
typedef typename vobj::vector_type vector; typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector); const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2; const int osites = rd*e1*e2;
commVector<vector>buffer(osites); deviceVector<vector>buffer(osites);
vector *dat = (vector *)Data; vector *dat = (vector *)Data;
vector *buf = &buffer[0]; vector *buf = &buffer[0];
Vector<vector> lvSum_small(rd); std::vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0]; vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) { for (int w = 0; w < words; w++) {
@@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r]; lvSum_ptr[w+words*r]=lvSum_small[r];
} }
}
} }
template<class vobj>
} inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{ {
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) { if constexpr (sizeof(vobj) <= 256) {
@@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data
} }
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) template<class vobj>
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
@@ -208,15 +247,19 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
}); });
} }
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else #else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif #endif
} }

View File

@@ -981,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[orthog] = slice; hcoor[orthog] = slice;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
} }
ddl++;
}
} }
peekLocalSite(s,lowDimv,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor); pokeLocalSite(s,higherDimv,hcoor);
@@ -1003,6 +1009,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
assert(orthog<nh); assert(orthog<nh);
assert(orthog>=0); assert(orthog>=0);
assert(hg->_processors[orthog]==1); assert(hg->_processors[orthog]==1);
lowDim.Checkerboard() = higherDim.Checkerboard();
int dl; dl = 0; int dl; dl = 0;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
@@ -1020,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
Coordinate lcoor(nl); Coordinate lcoor(nl);
Coordinate hcoor(nh); Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor); lg->LocalIndexToLocalCoor(idx,lcoor);
int ddl=0;
hcoor[orthog] = slice; hcoor[orthog] = slice;
int ddl=0;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
} }
} }
peekLocalSite(s,higherDimv,hcoor); peekLocalSite(s,higherDimv,hcoor);

View File

@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
* *
*/ */
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf, template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
Lattice<vobj> &lat, Lattice<vobj> &lat,
int x, int x,
int dim, int dim,
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
}); });
} }
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf, template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
const Lattice<vobj> &lat, const Lattice<vobj> &lat,
int x, int x,
int dim, int dim,
@@ -462,13 +462,19 @@ public:
int rNsimd = Nsimd / simd[dimension]; int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static cshiftVector<vobj> send_buf; static deviceVector<vobj> send_buf;
static cshiftVector<vobj> recv_buf; static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth); send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<CommsRequest_t> fwd_req; std::vector<MpiCommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req; std::vector<MpiCommsRequest_t> bwd_req;
int words = buffer_size; int words = buffer_size;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
@@ -495,9 +501,16 @@ public:
t_gather+=usecond()-t; t_gather+=usecond()-t;
t=usecond(); t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req, grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank, (void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
t_comms+=usecond()-t; t_comms+=usecond()-t;
} }
for ( int d=0;d < depth ; d ++ ) { for ( int d=0;d < depth ; d ++ ) {
@@ -508,9 +521,16 @@ public:
t_gather+= usecond() - t; t_gather+= usecond() - t;
t=usecond(); t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req, grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
t_comms+=usecond()-t; t_comms+=usecond()-t;
} }
@@ -533,6 +553,11 @@ public:
t=usecond(); t=usecond();
grid->CommsComplete(fwd_req); grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t; t_comms+= usecond() - t;
t=usecond(); t=usecond();
@@ -543,6 +568,11 @@ public:
t=usecond(); t=usecond();
grid->CommsComplete(bwd_req); grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t; t_comms+= usecond() - t;
t=usecond(); t=usecond();

View File

@@ -49,7 +49,7 @@ static constexpr int Tm = 7;
static constexpr int Nc=Config_Nc; static constexpr int Nc=Config_Nc;
static constexpr int Ns=4; static constexpr int Ns=4;
static constexpr int Nd=4; static constexpr int Nd=Config_Nd;
static constexpr int Nhs=2; // half spinor static constexpr int Nhs=2; // half spinor
static constexpr int Nds=8; // double stored gauge field static constexpr int Nds=8; // double stored gauge field
static constexpr int Ngp=2; // gparity index range static constexpr int Ngp=2; // gparity index range
@@ -75,6 +75,7 @@ static constexpr int InverseYes=1;
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
const int SpinorIndex = 2; const int SpinorIndex = 2;
const int PauliIndex = 2; //TensorLevel counts from the bottom!
template<typename T> struct isSpinor { template<typename T> struct isSpinor {
static constexpr bool value = (SpinorIndex==T::TensorLevel); static constexpr bool value = (SpinorIndex==T::TensorLevel);
}; };

View File

@@ -132,6 +132,10 @@ public:
template <class GaugeField > template <class GaugeField >
class EmptyAction : public Action <GaugeField> class EmptyAction : public Action <GaugeField>
{ {
using Action<GaugeField>::refresh;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative

View File

@@ -55,6 +55,11 @@ public:
RealD alpha; // Mobius scale RealD alpha; // Mobius scale
RealD k; // EOFA normalization constant RealD k; // EOFA normalization constant
// Device resident
deviceVector<Coeff_t> d_shift_coefficients;
deviceVector<Coeff_t> d_MooeeInv_shift_lc;
deviceVector<Coeff_t> d_MooeeInv_shift_norm;
virtual void Instantiatable(void) = 0; virtual void Instantiatable(void) = 0;
// EOFA-specific operations // EOFA-specific operations
@@ -92,6 +97,11 @@ public:
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) / this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) / ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) ); ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
d_shift_coefficients.resize(Ls);
d_MooeeInv_shift_lc.resize(Ls);
d_MooeeInv_shift_norm.resize(Ls);
}; };
}; };

View File

@@ -90,16 +90,16 @@ public:
void M5D(const FermionField &psi, void M5D(const FermionField &psi,
const FermionField &phi, const FermionField &phi,
FermionField &chi, FermionField &chi,
Vector<Coeff_t> &lower, std::vector<Coeff_t> &lower,
Vector<Coeff_t> &diag, std::vector<Coeff_t> &diag,
Vector<Coeff_t> &upper); std::vector<Coeff_t> &upper);
void M5Ddag(const FermionField &psi, void M5Ddag(const FermionField &psi,
const FermionField &phi, const FermionField &phi,
FermionField &chi, FermionField &chi,
Vector<Coeff_t> &lower, std::vector<Coeff_t> &lower,
Vector<Coeff_t> &diag, std::vector<Coeff_t> &diag,
Vector<Coeff_t> &upper); std::vector<Coeff_t> &upper);
virtual void Instantiatable(void)=0; virtual void Instantiatable(void)=0;
@@ -119,35 +119,51 @@ public:
RealD mass_plus, mass_minus; RealD mass_plus, mass_minus;
// Save arguments to SetCoefficientsInternal // Save arguments to SetCoefficientsInternal
Vector<Coeff_t> _gamma; std::vector<Coeff_t> _gamma;
RealD _zolo_hi; RealD _zolo_hi;
RealD _b; RealD _b;
RealD _c; RealD _c;
// possible boost
std::vector<ComplexD> qmu;
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
// Cayley form Moebius (tanh and zolotarev) // Cayley form Moebius (tanh and zolotarev)
Vector<Coeff_t> omega; std::vector<Coeff_t> omega;
Vector<Coeff_t> bs; // S dependent coeffs std::vector<Coeff_t> bs; // S dependent coeffs
Vector<Coeff_t> cs; std::vector<Coeff_t> cs;
Vector<Coeff_t> as; std::vector<Coeff_t> as;
// For preconditioning Cayley form // For preconditioning Cayley form
Vector<Coeff_t> bee; std::vector<Coeff_t> bee;
Vector<Coeff_t> cee; std::vector<Coeff_t> cee;
Vector<Coeff_t> aee; std::vector<Coeff_t> aee;
Vector<Coeff_t> beo; std::vector<Coeff_t> beo;
Vector<Coeff_t> ceo; std::vector<Coeff_t> ceo;
Vector<Coeff_t> aeo; std::vector<Coeff_t> aeo;
// LDU factorisation of the eeoo matrix // LDU factorisation of the eeoo matrix
Vector<Coeff_t> lee; std::vector<Coeff_t> lee;
Vector<Coeff_t> leem; std::vector<Coeff_t> leem;
Vector<Coeff_t> uee; std::vector<Coeff_t> uee;
Vector<Coeff_t> ueem; std::vector<Coeff_t> ueem;
Vector<Coeff_t> dee; std::vector<Coeff_t> dee;
// Device memory
deviceVector<Coeff_t> d_diag;
deviceVector<Coeff_t> d_upper;
deviceVector<Coeff_t> d_lower;
deviceVector<Coeff_t> d_lee;
deviceVector<Coeff_t> d_dee;
deviceVector<Coeff_t> d_uee;
deviceVector<Coeff_t> d_leem;
deviceVector<Coeff_t> d_ueem;
// Matrices of 5d ee inverse params // Matrices of 5d ee inverse params
Vector<iSinglet<Simd> > MatpInv; // std::vector<iSinglet<Simd> > MatpInv;
Vector<iSinglet<Simd> > MatmInv; // std::vector<iSinglet<Simd> > MatmInv;
Vector<iSinglet<Simd> > MatpInvDag; // std::vector<iSinglet<Simd> > MatpInvDag;
Vector<iSinglet<Simd> > MatmInvDag; // std::vector<iSinglet<Simd> > MatmInvDag;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Conserved current utilities // Conserved current utilities
@@ -187,7 +203,7 @@ public:
protected: protected:
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c); virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -0,0 +1,196 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Copyright (C) 2020 - 2025
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion5D<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
int n_rhs;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@@ -60,6 +60,50 @@ public:
// virtual void Instantiatable(void)=0; // virtual void Instantiatable(void)=0;
virtual void Instantiatable(void) =0; virtual void Instantiatable(void) =0;
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
{
std::cout << "Free Propagator for PartialFraction"<<std::endl;
FermionField in_k(in.Grid());
FermionField prop_k(in.Grid());
FFT theFFT((GridCartesian *) in.Grid());
//phase for boundary condition
ComplexField coor(in.Grid());
ComplexField ph(in.Grid()); ph = Zero();
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
double boundary_phase = ::acos(real(boundary[nu]));
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
//momenta for propagator shifted by twist+boundary
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
}
in_buf = exp(ci*ph*(-1.0))*in;
theFFT.FFT_all_dim(in_k,in,FFT::forward);
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
//phase for boundary condition
out = out * exp(ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
std::vector<Complex> boundary;
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
FreePropagator(in,out,mass,boundary,twist);
};
// Efficient support for multigrid coarsening // Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out); virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
@@ -90,12 +134,12 @@ protected:
RealD mass; RealD mass;
RealD R; RealD R;
RealD ZoloHiInv; RealD ZoloHiInv;
Vector<double> Beta; std::vector<double> Beta;
Vector<double> cc;; std::vector<double> cc;;
Vector<double> cc_d;; std::vector<double> cc_d;;
Vector<double> sqrt_cc; std::vector<double> sqrt_cc;
Vector<double> See; std::vector<double> See;
Vector<double> Aee; std::vector<double> Aee;
}; };

View File

@@ -69,10 +69,10 @@ public:
// Instantiate different versions depending on Impl // Instantiate different versions depending on Impl
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper); std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper); std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
virtual void RefreshShiftCoefficients(RealD new_shift); virtual void RefreshShiftCoefficients(RealD new_shift);
@@ -83,7 +83,7 @@ public:
RealD _M5, const ImplParams& p=ImplParams()); RealD _M5, const ImplParams& p=ImplParams());
protected: protected:
void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c); void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -123,10 +123,10 @@ public:
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu_v, lcoor); peekLocalSite(ScalarUmu, Umu_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); for (int mu = 0; mu < Nd; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj_v, lcoor); peekLocalSite(ScalarUmu, Uadj_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); for (int mu = 0; mu < Nd; mu++) ScalarUds(mu + Nd) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds_v, lcoor); pokeLocalSite(ScalarUds, Uds_v, lcoor);
}); });

View File

@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM); NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions #include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover); NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D); NAMESPACE_CHECK(Wilson5D);
@@ -84,6 +85,15 @@ NAMESPACE_CHECK(DomainWall);
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h> #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h> #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
NAMESPACE_CHECK(Overlap); NAMESPACE_CHECK(Overlap);
///////////////////////////////////////////////////////////////////////////////
// Two spin wilson fermion based
///////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
NAMESPACE_CHECK(TwoSpinWilson);
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@@ -164,12 +174,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
// Compact Clover fermions // Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>; template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>; template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2; typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF; typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD; typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2; typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF; typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD; typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;

View File

@@ -43,6 +43,7 @@ NAMESPACE_CHECK(FermionOperatorImpl);
NAMESPACE_CHECK(FermionOperator); NAMESPACE_CHECK(FermionOperator);
#include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions #include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions #include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/TwoSpinWilsonKernels.h> //used for 3D fermions, pauli in place of Dirac
NAMESPACE_CHECK(Kernels); NAMESPACE_CHECK(Kernels);
#endif #endif

View File

@@ -180,6 +180,12 @@ NAMESPACE_CHECK(ImplGparityWilson);
#include <Grid/qcd/action/fermion/StaggeredImpl.h> #include <Grid/qcd/action/fermion/StaggeredImpl.h>
NAMESPACE_CHECK(ImplStaggered); NAMESPACE_CHECK(ImplStaggered);
/////////////////////////////////////////////////////////////////////////////
// Two component spinor Wilson action for 3d / Boston
/////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/TwoSpinWilsonImpl.h>
NAMESPACE_CHECK(ImplTwoSpinWilson);
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Single flavour one component spinors with colour index. 5d vec // Single flavour one component spinors with colour index. 5d vec
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////

View File

@@ -274,7 +274,7 @@ public:
autoView( Uds_v , Uds, CpuWrite); autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite); autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{ thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss](); Uds_v[ss](0)(mu+Nd) = Utmp_v[ss]();
}); });
} }
Utmp = Uconj; Utmp = Uconj;
@@ -286,7 +286,7 @@ public:
autoView( Uds_v , Uds, CpuWrite); autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite); autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{ thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss](); Uds_v[ss](1)(mu+Nd) = Utmp_v[ss]();
}); });
} }
} }
@@ -320,7 +320,7 @@ public:
} }
Uconj = conjugate(*Upoke); Uconj = conjugate(*Upoke);
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4); pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + Nd);
} }
} }

View File

@@ -36,6 +36,8 @@ public:
static const std::vector<int> directions; static const std::vector<int> directions;
static const std::vector<int> displacements; static const std::vector<int> displacements;
static const int npoint = 16; static const int npoint = 16;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
}; };
template <class Impl> template <class Impl>
@@ -102,11 +104,11 @@ public:
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag); const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -164,8 +166,6 @@ public:
DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd; DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Conserved current utilities // Conserved current utilities

View File

@@ -40,6 +40,8 @@ public:
static const std::vector<int> directions; static const std::vector<int> directions;
static const std::vector<int> displacements; static const std::vector<int> displacements;
const int npoint = 16; const int npoint = 16;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
}; };
template<class Impl> template<class Impl>
@@ -100,7 +102,6 @@ public:
int dag); int dag);
void DhopInternal(StencilImpl & st, void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
@@ -108,7 +109,6 @@ public:
int dag); int dag);
void DhopInternalOverlappedComms(StencilImpl & st, void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
@@ -116,7 +116,6 @@ public:
int dag); int dag);
void DhopInternalSerialComms(StencilImpl & st, void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
DoubledGaugeField &UUU, DoubledGaugeField &UUU,
const FermionField &in, const FermionField &in,
@@ -192,8 +191,6 @@ public:
DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd; DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer // Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@@ -42,11 +42,11 @@ public:
public: public:
// Shift operator coefficients for red-black preconditioned Mobius EOFA // Shift operator coefficients for red-black preconditioned Mobius EOFA
Vector<Coeff_t> Mooee_shift; std::vector<Coeff_t> Mooee_shift;
Vector<Coeff_t> MooeeInv_shift_lc; std::vector<Coeff_t> MooeeInv_shift_lc;
Vector<Coeff_t> MooeeInv_shift_norm; std::vector<Coeff_t> MooeeInv_shift_norm;
Vector<Coeff_t> MooeeInvDag_shift_lc; std::vector<Coeff_t> MooeeInvDag_shift_lc;
Vector<Coeff_t> MooeeInvDag_shift_norm; std::vector<Coeff_t> MooeeInvDag_shift_norm;
virtual void Instantiatable(void) {}; virtual void Instantiatable(void) {};
@@ -74,18 +74,18 @@ public:
// Instantiate different versions depending on Impl // Instantiate different versions depending on Impl
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper); std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
Vector<Coeff_t>& shift_coeffs); std::vector<Coeff_t>& shift_coeffs);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper); std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
Vector<Coeff_t>& shift_coeffs); std::vector<Coeff_t>& shift_coeffs);
virtual void RefreshShiftCoefficients(RealD new_shift); virtual void RefreshShiftCoefficients(RealD new_shift);

View File

@@ -36,6 +36,8 @@ public:
static const std::vector<int> directions; static const std::vector<int> directions;
static const std::vector<int> displacements; static const std::vector<int> displacements;
static const int npoint = 8; static const int npoint = 8;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
}; };
template <class Impl> template <class Impl>
@@ -102,11 +104,11 @@ public:
GaugeField &mat, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag); const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -152,9 +154,6 @@ public:
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Conserved current utilities // Conserved current utilities
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@@ -41,6 +41,10 @@ public:
public: public:
// Constructors // Constructors
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu, OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@@ -41,6 +41,9 @@ public:
public: public:
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonContFracTanhFermion(GaugeField &_Umu, OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@@ -40,6 +40,9 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu, OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@@ -41,6 +41,9 @@ public:
public: public:
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu, OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@@ -40,6 +40,11 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu, OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
public: public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
const int part_frac_chroma_convention=1; const int part_frac_chroma_convention=0;
void Meooe_internal(const FermionField &in, FermionField &out,int dag); void Meooe_internal(const FermionField &in, FermionField &out,int dag);
void Mooee_internal(const FermionField &in, FermionField &out,int dag); void Mooee_internal(const FermionField &in, FermionField &out,int dag);
@@ -83,19 +83,78 @@ public:
GridRedBlackCartesian &FourDimRedBlackGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,const ImplParams &p= ImplParams()); RealD _mass,RealD M5,const ImplParams &p= ImplParams());
PartialFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
{
std::cout << "Free Propagator for PartialFraction"<<std::endl;
FermionField in_k(in.Grid());
FermionField prop_k(in.Grid());
FFT theFFT((GridCartesian *) in.Grid());
//phase for boundary condition
ComplexField coor(in.Grid());
ComplexField ph(in.Grid()); ph = Zero();
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
double boundary_phase = ::acos(real(boundary[nu]));
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
//momenta for propagator shifted by twist+boundary
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
}
in_buf = exp(ci*ph*(-1.0))*in;
theFFT.FFT_all_dim(in_k,in,FFT::forward);
if ( this->qmu.size() ){
this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
} else {
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
}
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
//phase for boundary condition
out = out * exp(ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
std::vector<Complex> boundary;
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
FreePropagator(in,out,mass,boundary,twist);
};
void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
protected: protected:
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata); virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
std::vector<RealD> qmu;
// Part frac // Part frac
RealD mass; RealD mass;
RealD dw_diag; RealD dw_diag;
RealD R; RealD R;
RealD amax; RealD amax;
RealD scale; RealD scale;
Vector<double> p; std::vector<double> p;
Vector<double> q; std::vector<double> q;
}; };

View File

@@ -35,7 +35,7 @@ template<class Matrix, class Field>
class KappaSimilarityTransform { class KappaSimilarityTransform {
public: public:
INHERIT_IMPL_TYPES(Matrix); INHERIT_IMPL_TYPES(Matrix);
Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag; std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
KappaSimilarityTransform (Matrix &zmob) { KappaSimilarityTransform (Matrix &zmob) {
for (int i=0;i<(int)zmob.bs.size();i++) { for (int i=0;i<(int)zmob.bs.size();i++) {

View File

@@ -141,9 +141,9 @@ public:
Udag = Udag *phases; Udag = Udag *phases;
InsertGaugeField(Uds,U,mu); InsertGaugeField(Uds,U,mu);
InsertGaugeField(Uds,Udag,mu+4); InsertGaugeField(Uds,Udag,mu+Nd);
// PokeIndex<LorentzIndex>(Uds, U, mu); // PokeIndex<LorentzIndex>(Uds, U, mu);
// PokeIndex<LorentzIndex>(Uds, Udag, mu + 4); // PokeIndex<LorentzIndex>(Uds, Udag, mu + Nd);
// 3 hop based on thin links. Crazy huh ? // 3 hop based on thin links. Crazy huh ?
U = PeekIndex<LorentzIndex>(Uthin, mu); U = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -156,7 +156,7 @@ public:
UUUdag = UUUdag *phases; UUUdag = UUUdag *phases;
InsertGaugeField(UUUds,UUU,mu); InsertGaugeField(UUUds,UUU,mu);
InsertGaugeField(UUUds,UUUdag,mu+4); InsertGaugeField(UUUds,UUUdag,mu+Nd);
} }
} }

View File

@@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
public: public:
void DhopImproved(StencilImpl &st, LebesgueOrder &lo, void DhopImproved(StencilImpl &st,
DoubledGaugeField &U, DoubledGaugeField &UUU, DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior); const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, LebesgueOrder &lo, void DhopNaive(StencilImpl &st,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior); const FermionField &in, FermionField &out, int dag, int interior,int exterior);

View File

@@ -0,0 +1,175 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma one
NAMESPACE_BEGIN(Grid);
class TwoSpinWilsonFermion3plus1DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static const std::vector<int> directions;
static const std::vector<int> displacements;
static constexpr int npoint = 6;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
};
template<class Impl>
class TwoSpinWilsonFermion3plus1D : public TwoSpinWilsonKernels<Impl>, public TwoSpinWilsonFermion3plus1DStatic
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef TwoSpinWilsonKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
int Dirichlet;
Coordinate Block;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _ThreeDimGrid ;}
GridBase *GaugeRedBlackGrid(void) { return _ThreeDimRedBlackGrid ;}
GridBase *FermionGrid(void) { return _FourDimGrid;}
GridBase *FermionRedBlackGrid(void) { return _FourDimRedBlackGrid;}
// full checkerboard operations; leave unimplemented as abstract for now
virtual void M (const FermionField &in, FermionField &out){assert(0);};
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
// These can be overridden by fancy 5d chiral action
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
// void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
// Implement hopping term non-hermitian hopping term; half cb or both
// Implement s-diagonal DW
void DW (const FermionField &in, FermionField &out,int dag);
void Dhop (const FermionField &in, FermionField &out,int dag);
void DhopOE(const FermionField &in, FermionField &out,int dag);
void DhopEO(const FermionField &in, FermionField &out,int dag);
void DhopComms (const FermionField &in, FermionField &out);
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
// add a DhopComm
// -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
void DhopDirComms(const FermionField &in);
void DhopDirCalc(const FermionField &in, FermionField &out,int point);
///////////////////////////////////////////////////////////////
// New methods added
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
GridCartesian &ThreeDimGrid,
GridRedBlackCartesian &ThreeDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams());
virtual void DirichletBlock(const Coordinate & block)
{
}
// DoubleStore
void ImportGauge(const GaugeField &_Umu);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:
// Add these to the support from Wilson
GridBase *_ThreeDimGrid;
GridBase *_ThreeDimRedBlackGrid;
GridBase *_FourDimGrid;
GridBase *_FourDimRedBlackGrid;
double M5;
int Ls;
//Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,222 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index
/////////////////////////////////////////////////////////////////////////////
template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
class TwoSpinWilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public:
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
//Necessary?
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
typedef typename Options::_Coeff_t Coeff_t;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplHalfCommSpinor<Simd> SiteHalfCommSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField;
typedef Lattice<SitePropagator> PropagatorField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef SimpleCompressor<SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
typedef const typename StencilImpl::View_type StencilView;
ImplParams Params;
TwoSpinWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
};
template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U,
const _Spinor &chi,
int mu)
{
auto UU = coalescedRead(U(mu));
mult(&phi(), &UU, &chi());
}
template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U,
const _Spinor &chi,
int mu,
StencilEntry *SE,
StencilView &St)
{
multLink(phi,U,chi,mu);
}
template<class _SpinorField>
inline void multLinkField(_SpinorField & out,
const DoubledGaugeField &Umu,
const _SpinorField & phi,
int mu)
{
const int Nsimd = SiteHalfSpinor::Nsimd();
autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead);
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
coalescedWrite(out_v[sss],tmp);
});
}
template <class ref>
static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
{
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &Uds,
const GaugeField &Umu)
{
typedef typename Simd::scalar_type scalar_type;
conformable(Uds.Grid(), GaugeGrid);
conformable(Umu.Grid(), GaugeGrid);
GaugeLinkField U(GaugeGrid);
GaugeLinkField tmp(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
////////////////////////////////////////////////////
// apply any boundary phase or twists
////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
////////// boundary phase /////////////
auto pha = Params.boundary_phases[mu];
scalar_type phase( real(pha),imag(pha) );
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu);
// apply any twists
RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
if ( theta != 0.0) {
scalar_type twphase(::cos(theta),::sin(theta));
U = twphase*U;
std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
}
tmp = where(coor == Lmu, phase * U, U);
PokeIndex<LorentzIndex>(Uds, tmp, mu);
U = adj(Cshift(U, mu, -1));
U = where(coor == 0, conjugate(phase) * U, U);
PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
GaugeLinkField link(mat.Grid());
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu);
}
inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
mat = outerProduct(B,A);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
mat = TraceIndex<SpinIndex>(P);
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
{
for (int mu = 0; mu < Nd; mu++)
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
{
int Ls=Btilde.Grid()->_fdimensions[0];
autoView( mat_v , mat, AcceleratorWrite);
{
const int Nsimd = SiteSpinor::Nsimd();
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF]()(spn) );
auto op = outerProduct(bb,aa);
sum = sum + op;
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
}
};
typedef TwoSpinWilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplR; // Real.. whichever prec
typedef TwoSpinWilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplF; // Float
typedef TwoSpinWilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD; // Double
typedef TwoSpinWilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD2; // Double
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,84 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Impl> class TwoSpinWilsonKernels : public FermionOperator<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public:
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids);
static void DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
private:
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
public:
TwoSpinWilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
};
NAMESPACE_END(Grid);

View File

@@ -47,7 +47,7 @@ public:
static int PartialCompressionFactor(GridBase *grid) { return 1;} static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif #endif
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table, static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs, const Lattice<vobj> &rhs,
cobj *buffer, cobj *buffer,
compressor &compress, compressor &compress,
@@ -109,7 +109,7 @@ public:
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask, std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial) compressor &compress,int type,int partial)
{ {
@@ -197,7 +197,7 @@ public:
#endif #endif
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table, static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs, const Lattice<vobj> &rhs,
cobj *buffer, cobj *buffer,
compressor &compress, compressor &compress,
@@ -208,7 +208,7 @@ public:
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
} }
template<class vobj,class cobj,class compressor> template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask, std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial) compressor &compress,int type,int partial)
{ {
@@ -402,7 +402,6 @@ public:
typedef CartesianStencil<vobj,cobj,Parameters> Base; typedef CartesianStencil<vobj,cobj,Parameters> Base;
typedef typename Base::View_type View_type; typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector;
// Vector<int> surface_list; // Vector<int> surface_list;
WilsonStencil(GridBase *grid, WilsonStencil(GridBase *grid,
@@ -416,29 +415,6 @@ public:
this->same_node.resize(npoints); this->same_node.resize(npoints);
}; };
/*
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
this->same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
*/
template < class compressor> template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
{ {
@@ -508,6 +484,11 @@ public:
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier(); accelerator_barrier();
#ifdef NVLINK_GET
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
#endif
} }
}; };

View File

@@ -38,6 +38,8 @@ public:
static int MortonOrder; static int MortonOrder;
static const std::vector<int> directions; static const std::vector<int> directions;
static const std::vector<int> displacements; static const std::vector<int> displacements;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
static const int npoint = 8; static const int npoint = 8;
}; };
@@ -126,13 +128,16 @@ public:
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag); const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void DhopInternal(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void DhopInternalSerial(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, void DhopInternalOverlappedComms(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
// Constructor // Constructor
@@ -168,9 +173,6 @@ public:
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
WilsonAnisotropyCoefficients anisotropyCoeff; WilsonAnisotropyCoefficients anisotropyCoeff;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@@ -62,6 +62,8 @@ public:
static const std::vector<int> directions; static const std::vector<int> directions;
static const std::vector<int> displacements; static const std::vector<int> displacements;
static constexpr int npoint = 8; static constexpr int npoint = 8;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
}; };
template<class Impl> template<class Impl>
@@ -91,13 +93,13 @@ public:
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);}; virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now // half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);}; virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);}; virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
@@ -109,6 +111,8 @@ public:
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
std::vector<double> qmu) ;
// Implement hopping term non-hermitian hopping term; half cb or both // Implement hopping term non-hermitian hopping term; half cb or both
// Implement s-diagonal DW // Implement s-diagonal DW
@@ -117,6 +121,9 @@ public:
void DhopOE(const FermionField &in, FermionField &out,int dag); void DhopOE(const FermionField &in, FermionField &out,int dag);
void DhopEO(const FermionField &in, FermionField &out,int dag); void DhopEO(const FermionField &in, FermionField &out,int dag);
void DhopComms (const FermionField &in, FermionField &out);
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
// add a DhopComm // add a DhopComm
// -- suboptimal interface will presently trigger multiple comms. // -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
@@ -135,21 +142,18 @@ public:
int dag); int dag);
void DhopInternal(StencilImpl & st, void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, FermionField &out,
int dag); int dag);
void DhopInternalOverlappedComms(StencilImpl & st, void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, FermionField &out,
int dag); int dag);
void DhopInternalSerialComms(StencilImpl & st, void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, const FermionField &in,
FermionField &out, FermionField &out,
@@ -203,9 +207,6 @@ public:
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer // Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@@ -166,7 +166,7 @@ public:
U = adj(Cshift(U, mu, -1)); U = adj(Cshift(U, mu, -1));
U = where(coor == 0, conjugate(phase) * U, U); U = where(coor == 0, conjugate(phase) * U, U);
PokeIndex<LorentzIndex>(Uds, U, mu + 4); PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
} }
} }

View File

@@ -57,6 +57,10 @@ public:
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ; int interior=1,int exterior=1) ;
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids);
static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ; int interior=1,int exterior=1) ;

View File

@@ -56,7 +56,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
Frbgrid, Frbgrid,
Ugrid, Ugrid,
Urbgrid, Urbgrid,
4.0,p) Nd*1.0,p)
{ {
update(_mass,_mu); update(_mass,_mu);
@@ -83,7 +83,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
//axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
for (int s=0;s<(int)this->mass.size();s++) { for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s]; ComplexD a = Nd*1.0+this->mass[s];
ComplexD b(0.0,this->mu[s]); ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(out,a,in,b,in,s,s); axpbg5y_ssp(out,a,in,b,in,s,s);
} }
@@ -92,7 +92,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
virtual void MooeeDag(const FermionField &in, FermionField &out) { virtual void MooeeDag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
for (int s=0;s<(int)this->mass.size();s++) { for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s]; ComplexD a = Nd*1.0+this->mass[s];
ComplexD b(0.0,-this->mu[s]); ComplexD b(0.0,-this->mu[s]);
axpbg5y_ssp(out,a,in,b,in,s,s); axpbg5y_ssp(out,a,in,b,in,s,s);
} }
@@ -101,7 +101,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
for (int s=0;s<(int)this->mass.size();s++) { for (int s=0;s<(int)this->mass.size();s++) {
RealD m = this->mass[s]; RealD m = this->mass[s];
RealD tm = this->mu[s]; RealD tm = this->mu[s];
RealD mtil = 4.0+this->mass[s]; RealD mtil = Nd*1.0+this->mass[s];
RealD sq = mtil*mtil+tm*tm; RealD sq = mtil*mtil+tm*tm;
ComplexD a = mtil/sq; ComplexD a = mtil/sq;
ComplexD b(0.0, -tm /sq); ComplexD b(0.0, -tm /sq);
@@ -112,7 +112,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
for (int s=0;s<(int)this->mass.size();s++) { for (int s=0;s<(int)this->mass.size();s++) {
RealD m = this->mass[s]; RealD m = this->mass[s];
RealD tm = this->mu[s]; RealD tm = this->mu[s];
RealD mtil = 4.0+this->mass[s]; RealD mtil = Nd*1.0+this->mass[s];
RealD sq = mtil*mtil+tm*tm; RealD sq = mtil*mtil+tm*tm;
ComplexD a = mtil/sq; ComplexD a = mtil/sq;
ComplexD b(0.0,tm /sq); ComplexD b(0.0,tm /sq);
@@ -126,7 +126,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
this->Dhop(in, out, DaggerNo); this->Dhop(in, out, DaggerNo);
FermionField tmp(out.Grid()); FermionField tmp(out.Grid());
for (int s=0;s<(int)this->mass.size();s++) { for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s]; ComplexD a = Nd*1.0+this->mass[s];
ComplexD b(0.0,this->mu[s]); ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(tmp,a,in,b,in,s,s); axpbg5y_ssp(tmp,a,in,b,in,s,s);
} }

Some files were not shown because too many files have changed in this diff Show More