1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-24 17:54:47 +01:00

Compare commits

..

1350 Commits

Author SHA1 Message Date
Chulwoo Jung
7780d88d26 Adding simple lanczos, boundary to specflow(!) 2025-08-06 23:41:53 +00:00
Chulwoo Jung
2bf9179d2c Adding mass step 2025-08-06 16:52:51 +00:00
Chulwoo Jung
c606f5dca0 Move out src initialization for re-use / Adding antiperiodic BC 2025-08-06 16:51:14 +00:00
Chulwoo Jung
8419cc5c64 specflow evec I/O added, 2025-07-11 15:57:23 -04:00
Chulwoo Jung
2cc6deb8e0 Merge branch 'develop' of https://github.com/paboyle/Grid into ic2 2025-04-25 10:48:41 -04:00
Chulwoo Jung
19d0590579 Checking in for merging 2025-04-25 10:48:22 -04:00
Peter Boyle
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
Peter Boyle
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
Peter Boyle
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
Peter Boyle
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
Peter Boyle
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
Peter Boyle
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGagueAction deriv
2025-04-24 14:39:10 -04:00
Peter Boyle
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
Peter Boyle
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
Peter Boyle
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
Peter Boyle
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
Peter Boyle
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
Peter Boyle
233150d93f Bug fix for no accelerator aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
Peter Boyle
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
Chulwoo Jung
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
Chulwoo Jung
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
Christoph Lehner
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
Christoph Lehner
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
Peter Boyle
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
Peter Boyle
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
Peter Boyle
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
Peter Boyle
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
Peter Boyle
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
Peter Boyle
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
Peter Boyle
938c47480f Updated compile on frontier.
Unsatisfactory hacsk
2025-04-04 18:35:06 -04:00
Peter Boyle
3811d19298 Fence 2025-04-04 18:35:06 -04:00
Peter Boyle
83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
Peter Boyle
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
Peter Boyle
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
Peter Boyle
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
Peter Boyle
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
Peter Boyle
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
Peter Boyle
9eae8fca5d Size outut 2025-04-04 18:35:05 -04:00
Peter Boyle
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
Mashy Green
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
Mashy Green
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
Peter Boyle
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
Christoph Lehner
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
Christoph Lehner
e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
Christoph Lehner
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora nowb 2025-03-11 13:50:43 +00:00
Mashy Green
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
Mashy Green
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
Mashy Green
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
Christoph Lehner
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
Peter Boyle
3d014864e2 Makinig LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
Peter Boyle
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
Peter Boyle
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
Peter Boyle
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
Peter Boyle
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
Peter Boyle
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
Peter Boyle
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
Peter Boyle
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
Peter Boyle
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
Peter Boyle
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
Muhammad Asif
b1ba209696 Latest upstream with np-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
Muhammad Asif
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
Mashy Green
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
Mashy Green
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
Mashy Green
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regresion test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after revering too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
Mashy Green
d2dd8f54e2 Fixing after revering too much! 2025-02-17 17:32:27 +00:00
Mashy Green
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce throuhgh Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issueing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
Mashy Green
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
b50fb34e71 Perf on Aurora 2025-02-01 18:39:34 +00:00
de84d730ff Fastest run config on Aurora to date 2025-02-01 18:08:40 +00:00
Peter Boyle
c74d11e3d7 PVdagM MG 2025-02-01 11:04:13 -05:00
Christoph Lehner
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
c4fc972fec Merge branch 'feature/deprecate-uvm' into develop 2025-01-31 16:32:36 +00:00
8cf809e231 Best results on Aurora so far 2025-01-31 16:14:45 +00:00
94019a922e Significantly better performance on Aurora without using pipeline mode 2025-01-30 16:36:46 +00:00
Mashy Green
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
Mashy Green
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
d6b2727f86 Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora 2025-01-29 09:22:21 +00:00
74a4f43946 Optional host buffer bounce for no CUDA aware MPI 2025-01-28 15:22:46 +00:00
1caf8b0f86 Rename 2025-01-28 15:22:37 +00:00
Chulwoo Jung
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
Chulwoo Jung
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
Peter Boyle
3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
Chulwoo Jung
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
Chulwoo Jung
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
Mashy Green
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
Mashy Green
e99d42404e Removing the regresion test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
Mashy Green
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
Mashy Green
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
8fe429346f Dslash testing for reproduce 2024-11-11 23:11:11 +00:00
Peter Boyle
5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
Peter Boyle
b91fc1b6b4 Merge branch 'feature/boosted' into feature/deprecate-uvm
Fixed boosted free field test
2024-10-28 16:53:09 -04:00
Peter Boyle
eafc150034 Test fft asserts 2024-10-23 16:46:26 -04:00
Peter Boyle
2877f1a268 Verbose reduce 2024-10-23 15:14:16 -04:00
Peter Boyle
1e893af775 GPU happy 2024-10-23 14:52:15 -04:00
Peter Boyle
d9f430a575 Happy GPU 2024-10-23 14:51:16 -04:00
Peter Boyle
63abe87f36 Memory manager verbose improvements that were useful to track an error 2024-10-23 14:49:13 -04:00
Peter Boyle
368d649c8a feature/deprecate-uvm happier -- preallocate device resident neigbour table 2024-10-23 14:47:55 -04:00
Peter Boyle
5603464f39 Fix in partial fraction import/export physical and
make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class
2024-10-23 14:45:58 -04:00
Peter Boyle
655c79f39e Suppress warning on partial override 2024-10-23 14:44:41 -04:00
Peter Boyle
565b231c03 Nvcc happy 2024-10-23 14:44:17 -04:00
Peter Boyle
62a9f180fa NVCC happy 2024-10-23 14:44:04 -04:00
Peter Boyle
5ae77876a8 Meson field and Aslash field on GPU; some compiler warning removed 2024-10-18 19:08:06 -04:00
Peter Boyle
4ed2c2c74f Config command 2024-10-18 13:58:33 -04:00
Peter Boyle
955da582b6 Working on NVCC 2024-10-18 13:58:03 -04:00
Peter Boyle
11b07b950d Vanilla linux compile, assuming spack prerequisites 2024-10-18 13:57:40 -04:00
Peter Boyle
8f70cfeda9 Clean up 2024-10-18 13:56:53 -04:00
Peter Boyle
ce64271048 Remove the copying version 2024-10-18 13:56:24 -04:00
5cc4f3241d Meson field test 2024-10-18 15:42:30 +00:00
Peter Boyle
6815e138b4 Boosted fermion attempt 2024-10-17 18:37:33 +01:00
a78a61d76f Update configure 2024-10-15 14:38:45 +00:00
2eff3f34ed Alternate reduction; default to grids own but make a configure flag
--enable-reduction=grid|mpi
2024-10-15 14:36:06 +00:00
03687c1d62 Final version of test, closer to original again 2024-10-15 14:35:17 +00:00
febfe4e77f Make my own reduction a configure flag 2024-10-15 14:32:35 +00:00
4d1aa134b5 Use normal reduction, configure flag to force deterministic 2024-10-15 14:32:11 +00:00
5ec879860a Odd rounding issue - bears looking into 2024-10-15 14:30:54 +00:00
Peter Boyle
f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
b728af903c Fast axpy norm under CFLAG 2024-10-11 03:23:09 +00:00
54f1999030 axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues 2024-10-11 03:22:18 +00:00
fd58f0b669 Return ok 2024-10-11 03:21:21 +00:00
c5c67b706e cl::sycl -> SYCL 2024-10-10 22:04:12 +00:00
be7a543e2c Revert barriers -- these were not the problem 2024-10-10 22:03:29 +00:00
68f112d576 New software moves cl::sycl 2024-10-10 22:03:04 +00:00
ec1395a304 Better flight logging 2024-10-10 22:01:57 +00:00
beb0e474ee Use deterministic own brand reduction 2024-10-10 22:01:24 +00:00
2b5fdcbbc5 New software version 2024-10-10 21:59:02 +00:00
295127d456 Deterministic homebrew reduction 2024-10-10 21:58:26 +00:00
7dcfb13694 New software stack 2024-10-10 21:57:35 +00:00
Peter Boyle
ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows for work around on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
Peter Boyle
2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
Peter Boyle
1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
Peter Boyle
3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
Peter Boyle
35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
Peter Boyle
bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
Peter Boyle
f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
9fa8bd6438 Configure for AOT on Aurora latest software 2024-09-23 11:25:44 +00:00
02c8178f16 Almost working on Aurora 2024-09-23 09:43:50 +00:00
e637fbacae Verbose remove 2024-09-23 09:42:43 +00:00
066544281f Deprecate UVM 2024-09-17 13:34:27 +00:00
11be10d2c0 Aurora testing 2024-09-10 18:11:52 +00:00
160969a758 UVM tester, doesn't turn up anything 2024-09-10 18:09:42 +00:00
622f78ebea SYCL updates -- operator = giving trouble on Aurora.
SYCL reduction is failing intermittently with SVM interface - returns
zero, expect non-zero.
Think I need to remove ALL dependence on SVM.
2024-09-04 13:53:48 +00:00
Peter Boyle
aa67a5b095 Rename 2024-08-27 19:54:01 +00:00
Peter Boyle
af9ea0864c Blas fix 2024-08-27 19:53:09 +00:00
Peter Boyle
4e2a6d87c4 Gemm batched fix 2024-08-27 19:24:05 +00:00
Peter Boyle
a465ecece9 Aurora 2024-08-27 19:20:43 +00:00
Peter Boyle
575eb72182 Converges on 16^3 2024-08-27 19:20:38 +00:00
Peter Boyle
3a973914d6 Compile on frontier 2024-08-27 14:55:42 -04:00
Peter Boyle
f568c07bbd Improved the BLAS benchmark 2024-08-27 14:53:54 -04:00
Peter Boyle
2c9878fc3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-27 12:05:46 -04:00
Peter Boyle
27b1b1b005 Checkerboard available for offloading pickCheckerboard 2024-08-27 12:04:09 -04:00
Peter Boyle
130d7ab077 Verbose changes 2024-08-27 12:03:28 -04:00
Peter Boyle
29f6b8a74a Setup 2024-08-27 12:02:49 -04:00
Peter Boyle
9779aaea33 16^3 optimise 2024-08-27 11:38:35 -04:00
Peter Boyle
ec25604a67 Fastest solver for mrhs multigrid 2024-08-27 11:32:34 -04:00
Peter Boyle
3668e81c5e Extract slice working on checkerboard field for Block Lanczos 2024-08-27 11:31:30 -04:00
Peter Boyle
d66b2423cb Move slice operations to GPU for BlockCG 2024-08-27 11:28:47 -04:00
Peter Boyle
15cc78f0b6 peek/poke local site on checkerboard arrays 2024-08-27 11:23:42 -04:00
Peter Boyle
06db4ddea2 Fast init on GPU 2024-08-27 11:22:33 -04:00
Peter Boyle
6cfb90e99f Support needed for accelerator resident set/pick Checkerboard 2024-08-27 11:19:00 -04:00
Peter Boyle
d8be95a2a3 Don't early terminate power method to get more accurate top EV 2024-08-27 11:17:37 -04:00
Peter Boyle
f82702872d Normal residual 2024-08-27 11:16:44 -04:00
Peter Boyle
3752c49ef0 Add option to record the CG polynomial 2024-08-27 11:14:35 -04:00
Peter Boyle
fe65fa4988 MulMatrix 2024-08-27 11:13:18 -04:00
Peter Boyle
1fe4c205a3 Adef 2024-08-27 11:11:47 -04:00
Peter Boyle
d4dc5e0f43 BlockCG linalg acceleratoin with BLAS 2024-08-27 11:08:33 -04:00
Peter Boyle
77944437ce Functor initialisation 2024-08-27 11:01:02 -04:00
Peter Boyle
c164bff758 MMdag 2024-08-27 11:00:36 -04:00
Peter Boyle
aa2e3d954a MMdag operator 2024-08-27 10:59:29 -04:00
Peter Boyle
de62b04728 Block CG linalg acceleration 2024-08-27 10:58:54 -04:00
Peter Boyle
d0bdb50f24 Analyse power spectrum 2024-08-27 10:58:19 -04:00
Peter Boyle
a8fecbc609 BlockCG linalg via BLAS 2024-08-21 16:08:16 -04:00
Peter Boyle
557fa483ff Blas benchmark committed stand alone 2024-08-20 16:18:43 +00:00
Peter Boyle
fc15d55df6 Mallinfo 2024-08-20 14:33:09 +00:00
Peter Boyle
53573d7d94 Better benchmark 2024-08-20 14:31:57 +00:00
Peter Boyle
bb3c177000 Better benchmarking 2024-08-20 14:31:41 +00:00
Peter Boyle
a3322b470f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-20 14:30:52 +00:00
Peter Boyle
f8f408e7a9 BLAS everywhere 2024-07-25 18:09:02 +00:00
Peter Boyle
baac1127d0 Later intel compiler happiness 2024-07-25 18:06:05 +00:00
Peter Boyle
6f1328160c Remove SVM use 2024-07-25 18:05:40 +00:00
Peter Boyle
04cf902791 Mallinfo and ASAN hooks 2024-07-25 18:04:56 +00:00
Peter Boyle
7a5b1c1a19 Try Catch convenience macro 2024-07-25 18:03:41 +00:00
Peter Boyle
18d2d7da4a Eigen implementation and SYCL implementation 2024-07-25 18:02:56 +00:00
Peter Boyle
b461184797 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-23 09:53:58 -04:00
Peter Boyle
4563b39305 New Frontier config 2024-07-23 09:53:08 -04:00
Peter Boyle
c9d5674d5b FInal for paper 2024-07-22 15:26:45 -04:00
Peter Boyle
486412635a 8^4 test for PETSc 2024-07-22 15:25:17 -04:00
Peter Boyle
8b23a1546a Force compile temporarily 2024-07-22 15:24:56 -04:00
Peter Boyle
a901e4e369 Regressed performance for paper 2024-07-22 15:24:04 -04:00
Peter Boyle
804d9367d4 Regressed performance 2024-07-22 15:23:25 -04:00
Peter Boyle
41d8adca95 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-11 15:38:45 +00:00
Peter Boyle
059e8e5bb0 New compile option 2024-07-11 15:37:30 +00:00
Peter Boyle
b3ee8ded96 Respect command line 2024-07-11 15:34:48 +00:00
Peter Boyle
cf3584ad15 Convenient to monitor memory across an HMC trajectory 2024-07-11 15:30:32 +00:00
Peter Boyle
a66973163f Device vector not UVM 2024-07-11 15:24:11 +00:00
Peter Boyle
4502a8c8a1 libc malloc heap info dump on Linux 2024-07-11 15:22:18 +00:00
Peter Boyle
9c902e4c2d Batched blas, but not working yet on OneAPI 2024-07-11 15:19:49 +00:00
Peter Boyle
f3eb36adcf Namespace addition 2024-07-11 15:19:19 +00:00
Peter Boyle
7c246606c1 Schur additional case 2024-07-10 22:04:32 +00:00
Peter Boyle
172c75029e Redblack additional case 2024-07-10 22:03:59 +00:00
Peter Boyle
6ae52da571 LLVM leak sanitizer 2024-07-08 15:59:18 +00:00
Peter Boyle
4ee9c68053 Updated compile environment 2024-07-08 15:57:57 +00:00
Peter Boyle
a15b4378a3 Sanitizer preservation of options 2024-07-08 15:57:45 +00:00
Peter Boyle
89fdd7f8dd AOT compilation 2024-07-05 17:47:56 +00:00
Peter Boyle
c328be24b7 Sanitizer compile options 2024-07-05 17:46:43 +00:00
Peter Boyle
a73dc6dbf4 Display linux heap info 2024-06-28 16:05:17 +00:00
Peter Boyle
eee2a2657f Try catch exception wrappers 2024-06-28 16:02:29 +00:00
Peter Boyle
12b8be7cb9 Best so far on 96^3 350 Evecs converged on 4^4 block 2024-06-18 16:31:37 -04:00
Peter Boyle
63c223ea5d Verbose 2024-06-18 03:22:01 +00:00
Peter Boyle
2877fb4a2c More verbose if alloc failure 2024-06-18 03:21:03 +00:00
Peter Boyle
d299c86633 Std::asin,acos 2024-06-11 16:41:23 -04:00
Peter Boyle
6ce52092e8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 15:16:58 -04:00
Peter Boyle
b5926c1d21 Broadcast time info 2024-06-11 15:16:25 -04:00
Peter Boyle
9563238e9b Force initial to identity 2024-06-11 17:51:58 +00:00
Peter Boyle
fb9b1d76ca Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 16:48:16 +00:00
Peter Boyle
1739146599 Property to initialise reduction 2024-06-11 16:47:35 +00:00
Peter Boyle
ed20b39ab3 Log files from Frontier benchmark 2024-06-11 11:16:20 -04:00
Peter Boyle
284fc05f15 Protect vs. missing LIME libarary 2024-06-11 11:08:00 -04:00
Peter Boyle
07a07b6fa3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-10 15:09:25 -04:00
Peter Boyle
dc80b08969 96^3 test 2024-06-10 15:07:29 -04:00
Peter Boyle
a49a161f8d SYCL update to use buffer on reduction variable 2024-06-08 16:05:18 +00:00
Peter Boyle
a6479ca50f Shuhei's ComputeWilsonFlow main programme 2024-06-05 15:51:11 -04:00
Peter Boyle
0e607a55e7 Updated for 8^4 test 2024-05-26 20:53:05 +00:00
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
Peter Boyle
c4b9f71357 CPU compile ordering is important 2024-05-21 02:22:32 +01:00
Peter Boyle
394e506aea Compile options for tursa update 2024-05-21 02:10:04 +01:00
Peter Boyle
e19b26341b Tursa configure update 2024-05-21 01:14:27 +01:00
Peter Boyle
cfe1b13225 Back out zero change 2024-05-21 01:14:08 +01:00
Peter Boyle
890c5ea1cd Warning disable 2024-05-20 20:08:31 +01:00
Peter Boyle
a87378d3b6 Update 2024-05-20 20:08:31 +01:00
Peter Boyle
832fc08809 Merge pull request #459 from dbollweg/sycl_slicesum_update
Sycl slicesum bugfix
2024-05-20 15:06:53 -04:00
Peter Boyle
9a1ad6a5eb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-05-17 11:33:46 -04:00
Peter Boyle
a90eafad24 Merge branch 'feature/scidac-wp1' into develop 2024-05-17 11:32:00 -04:00
Peter Boyle
ad14a82742 Working aas good as possible on 48^3 in double 2024-05-16 10:55:45 -04:00
Peter Boyle
14e9d8ed9f CG improvements for smoother 2024-05-16 10:55:18 -04:00
Peter Boyle
0ac85fa70b Serialisation removal 2024-05-16 10:49:04 -04:00
Peter Boyle
c371de42b9 Some site tools for sitewise autocorr 2024-05-16 10:48:23 -04:00
Peter Boyle
ccf147d6c1 Select the compiler that gives better performance on sunspot 2024-05-07 18:45:56 +00:00
Peter Boyle
7aa12b446f New config command for sunspot 2024-05-07 18:45:40 +00:00
Peter Boyle
c293228102 layout control 2024-05-07 18:45:21 +00:00
Peter Boyle
5c4c9f721a Remove pbs file and replace with bench1 and bench2 for 1 and 2 nodes 2024-05-07 18:44:49 +00:00
Peter Boyle
057f86c1de 2 queues works ok in performance 2024-05-07 18:42:50 +00:00
Peter Boyle
cd52e3cbc2 Jobs on subspot 2024-05-07 18:38:15 +00:00
Peter Boyle
24602e1259 Accidental synchronise 2024-05-07 17:28:38 +00:00
Peter Boyle
8a098889fc Update FlightRecorder.cc 2024-04-30 21:15:08 +01:00
Peter Boyle
5c3ace7c3e Merge branch 'develop' into feature/scidac-wp1 2024-04-30 05:26:06 -04:00
Peter Boyle
aa148455b7 Updated todo list 2024-04-30 05:24:39 -04:00
Peter Boyle
98cf247f33 prepare to switch to mixed precision 2024-04-30 05:23:45 -04:00
Peter Boyle
0cf16522d1 Refine with HDCG choice 2024-04-30 05:22:14 -04:00
Peter Boyle
7b7c75f9e5 Setup 2024-04-30 05:21:02 -04:00
Peter Boyle
aefd255a3c Verbose 2024-04-30 05:20:41 -04:00
Peter Boyle
1c5aa939fd Subspace setup changes 2024-04-30 05:19:09 -04:00
Peter Boyle
3a0ff17be0 Verbose changes 2024-04-30 05:17:28 -04:00
Peter Boyle
47829ae5cc Verbose changes 2024-04-30 05:16:46 -04:00
Peter Boyle
bfa7b69aff Verbose changes 2024-04-16 15:42:46 -04:00
Peter Boyle
2aaa959b5f Printing changes 2024-04-16 15:41:25 -04:00
Peter Boyle
ce2970b93a Printing changes 2024-04-16 15:40:38 -04:00
Peter Boyle
7b76970d10 Verbose changes 2024-04-16 15:40:10 -04:00
Peter Boyle
9fd41882d2 Herm Op update 2024-04-16 15:39:27 -04:00
Peter Boyle
ff2ea5de18 Update Tensor_traits.h 2024-04-11 14:25:45 -04:00
Peter Boyle
5147a42818 Updated hdcg 2024-04-05 01:05:57 -04:00
Peter Boyle
57552d8ca3 Assign from non-lattice made accelerator resident 2024-04-05 01:05:12 -04:00
Peter Boyle
13713b2a76 Much faster little dirac operator calculation 2024-04-05 01:04:40 -04:00
Peter Boyle
36a14e4ee3 Best setup and introduce an HDCG refine method 2024-04-05 01:03:33 -04:00
Peter Boyle
b4cc788b8c First version used in mrhsHDCG
Need to consolidate files.
Plan: Make this version able to go virtual base, then absorb chulwoos
version when it is proven
2024-04-05 01:02:21 -04:00
Peter Boyle
0f0e7512f3 Keep MRHS in a different file 2024-04-05 00:59:53 -04:00
Peter Boyle
1196b1a161 Less verbose 2024-04-05 00:58:58 -04:00
Peter Boyle
2c8c3be9ee Adef2Mrhs 2024-04-05 00:57:13 -04:00
Peter Boyle
5b79d51c22 Improvements 2024-04-01 14:18:40 -04:00
Peter Boyle
da890dc293 Verbose changes 2024-04-01 14:18:00 -04:00
Peter Boyle
93d0a1e73a HISQ view call 2024-04-01 14:16:47 -04:00
Peter Boyle
f0a8c7d045 Playing with chebyshevs 2024-04-01 14:16:11 -04:00
Peter Boyle
db8793777c Logging/verbose 2024-04-01 14:15:41 -04:00
Peter Boyle
c745484e65 9.5x speed up version 2024-04-01 14:14:30 -04:00
Peter Boyle
da59379612 Large reg file for double 2024-03-26 17:03:20 +00:00
Peter Boyle
3ef2a41518 ifdef guard ommitted 2024-03-26 14:50:32 +00:00
Peter Boyle
aa96f420c6 Acclerator ware MPI guard on the Unix domain sockets 2024-03-26 14:41:25 +00:00
Peter Boyle
49e9e4ed0e Fences 2024-03-26 14:14:06 +00:00
Peter Boyle
f7b8163016 Deterministic MPI reduce options 2024-03-26 14:11:40 +00:00
Peter Boyle
93769eacd3 Updated configure for bounce through host 2024-03-26 14:10:24 +00:00
Peter Boyle
59b0cc11df REduce the time in single 2024-03-26 00:42:40 +00:00
Peter Boyle
f32c275376 Updated config options for MPI not being aware of GPU 2024-03-26 00:42:00 +00:00
Peter Boyle
5404fc66ab Merge needs a fence on SYCL 2024-03-26 00:38:41 +00:00
Peter Boyle
1f53458af8 Options to bounce through a host buffer if
--disable-accelerator-aware-mpi
2024-03-26 00:37:19 +00:00
Peter Boyle
434c3e7f1d We have a choice of GET or PUT across NVlink 2024-03-25 14:32:44 +00:00
Peter Boyle
500b119f3d Deterministic MPI 2024-03-22 15:55:23 +00:00
Peter Boyle
4b87259c1b New config command for sunspot 2024-03-22 15:43:49 +00:00
Peter Boyle
503dec34ef This appears working now on Sunspot 2024-03-22 15:43:30 +00:00
Peter Boyle
d1e9fe50d2 Xor csum for repro testing 2024-03-22 15:42:57 +00:00
Peter Boyle
d01e5fa838 Improved FlightRecorder 2024-03-22 15:42:32 +00:00
Peter Boyle
a477c25e8c Sunspot repro tests 2024-03-22 15:42:11 +00:00
Peter Boyle
1bd20cd9e8 FlightRecorder 2024-03-22 15:40:01 +00:00
Peter Boyle
e49e95b037 Upgrade of the Britney test with flight recorder and fast xor checksum 2024-03-22 15:39:27 +00:00
Peter Boyle
6f59fed563 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:32 +00:00
Peter Boyle
60b7f6c99d Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:26 +00:00
Peter Boyle
b92dfcc8d3 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:27 +00:00
Peter Boyle
f6fd6dd053 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:01 +00:00
Peter Boyle
79ad567dd5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-19 15:43:42 +00:00
Peter Boyle
fab1efb48c More britney logging improvements 2024-03-19 14:36:21 +00:00
Peter Boyle
660eb76d93 FFTW from OneAPI 2024-03-19 14:28:33 +00:00
dbollweg
461cd045c6 sliceSum cleanup 2024-03-13 18:18:44 -04:00
dbollweg
fee65d7a75 Merge branch 'paboyle:develop' into sycl_slicesum_update 2024-03-13 18:06:17 -04:00
dbollweg
31f9971dbf avoid PI_ERROR_OUT_OF_RESOURCES in sycl sliceSum 2024-03-13 13:39:26 -04:00
Peter Boyle
62e7bf024a Updated flight logging for Britney test 2024-03-12 20:10:04 +00:00
Peter Boyle
95f3d69cf9 Extra hardware test hook 2024-03-12 20:09:37 +00:00
89c0519f83 Repro test 2024-03-12 16:11:33 +00:00
2704b82084 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-12 15:16:24 +00:00
cf8632bbac Britney test option 2024-03-12 15:15:35 +00:00
d224297972 PBS scripts 2024-03-12 15:15:16 +00:00
Peter Boyle
a4d11a630f Merge pull request #458 from paboyle/fix/HOST_NAME_MAX
fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined
2024-03-07 07:50:25 -05:00
2b4399f8b1 more HOST_NAME_MAX fix 2024-03-07 15:26:01 +09:00
f17b8de907 fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined 2024-03-07 15:22:08 +09:00
dbollweg
d87296f3e8 Merge branch 'develop' of https://github.com/dbollweg/Grid into develop 2024-03-06 16:54:22 -05:00
dbollweg
be94cf1c6f Fewer wait-calls in sycl slicesum 2024-03-06 16:53:13 -05:00
Peter Boyle
cc04dc42dc Merge branch 'develop' into feature/scidac-wp1 2024-03-06 14:55:21 -05:00
Peter Boyle
070b61f08f Simplifying the MultiRHS solver to make it do SRHS *and* MRHS 2024-03-06 14:04:33 -05:00
Peter Boyle
7e5bd46dd3 Booster update 2024-03-06 19:03:45 +01:00
Peter Boyle
228bbb9d81 Benchmark results 2024-03-06 19:03:35 +01:00
b812a7b4c6 Staggered launch script 2024-03-06 01:32:40 +00:00
891a366f73 Repro CG script 2024-03-06 01:22:55 +00:00
10116b3be8 Force device copyable and tell SYCL to shut it. 2024-03-06 01:13:27 +00:00
a46a0f0882 force device copyable and don't take crap from SYCL 2024-03-06 01:12:49 +00:00
a26a8a38f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-06 00:05:00 +00:00
7435315d50 More blasted shell variables 2024-03-06 00:03:59 +00:00
9b5f741e85 Reproducing CG can be more useful now 2024-03-06 00:03:16 +00:00
517822fdd2 SPR HBM benchmarking right and also PVC batched GEMM 2024-03-06 00:02:27 +00:00
1b93a9be88 Print out the hostname 2024-03-06 00:01:58 +00:00
783a66b348 Deterministic reduction please 2024-03-06 00:01:37 +00:00
976c3e9b59 Hack for flight logging CG inner products.
Can be made to work, but could put in some more serious infrastructure
for repro testing and blame attribution (Britney test) if necessary
2024-03-05 23:59:57 +00:00
f8ca971dae Use of a bare PRECISION macro is not namespace safe and collides with
SYCL
2024-03-05 23:59:13 +00:00
21bc8c24df OneMKL batched blas starting 2024-03-05 23:58:20 +00:00
30228214f7 SYCL conflict with Eigen 2024-03-05 23:56:10 +00:00
Peter Boyle
2ae980ae43 Update sourceme.sh 2024-03-05 13:39:18 -05:00
Peter Boyle
6153dec2e4 Update setup.sh 2024-03-05 13:38:32 -05:00
Peter Boyle
c805f86343 USQCD benchmark 2024-03-01 00:05:04 -05:00
Peter Boyle
04ca065281 Only one rank opens 2024-02-29 20:09:11 -05:00
Peter Boyle
88d8fa43d7 Benchmark development 2024-02-29 20:01:44 -05:00
Peter Boyle
3c49762875 Propagate in the blas routine 2024-02-29 15:33:06 -05:00
Peter Boyle
436bf1d9d3 Merge pull request #455 from clarkedavida/hisq_fat_links
Hisq fat links
2024-02-29 15:29:39 -05:00
david clarke
f70df6e195 changed NO_SHIFT and BACKWARD_CONST from define to enum 2024-02-29 12:29:30 -07:00
Peter Boyle
fce3852dff Merge pull request #451 from paboyle/feature/eigen-3.4.0-update
updating Eigen to 3.4.0
2024-02-28 18:03:37 -05:00
Peter Boyle
ee1b8bbdbd Merge pull request #454 from edbennett/adjoint-broke
fix HMC for non-fundamental representations
2024-02-28 14:05:27 -05:00
Peter Boyle
3f1636637d Merge pull request #453 from dbollweg/feature/sliceSum_gpu
Feature/slice sum gpu
2024-02-28 14:04:43 -05:00
Peter Boyle
2e570f5300 Merge pull request #457 from lehner/feature/gpt
Import GPT-related updates
2024-02-28 13:59:04 -05:00
Christoph Lehner
9f89486df5 remove unnecessary code path 2024-02-28 19:56:23 +01:00
Christoph Lehner
22b43b86cb Make GPT test suite work with SYCL 2024-02-28 12:57:17 +01:00
dbollweg
3c9012676a CUDA cub refuses to reduce vSpinColourMatrix, breaking up into smaller parts like already done for HIP case. 2024-02-27 12:41:45 -05:00
Peter Boyle
ee3b3c4c56 relocate deflation support 2024-02-27 11:52:23 -05:00
Peter Boyle
462d706a63 Move to a blas directory 2024-02-27 11:51:04 -05:00
Peter Boyle
ee0d460c8e Blas based block project & deflate for multiRHS 2024-02-27 11:41:44 -05:00
Peter Boyle
cd15abe9d1 Mrhs prep 2024-02-27 11:41:13 -05:00
Peter Boyle
9f40467e24 Warning squash 2024-02-27 11:40:36 -05:00
Peter Boyle
d0b6593823 More verbose on checksum 2024-02-27 11:40:14 -05:00
Peter Boyle
79fc821d8d reorg headers 2024-02-27 11:39:37 -05:00
Peter Boyle
d7fdb9a7e6 Reorg headers 2024-02-27 11:39:06 -05:00
Peter Boyle
b74de51c18 Reorder headers 2024-02-27 11:38:52 -05:00
Dennis Bollweg
b507fe209c Added SpinColourMatrix case to sliceSum Test 2024-02-27 11:28:32 -05:00
Dennis Bollweg
6cd2d8fcd5 Replace cuda/hip memcpy with Grid functions 2024-02-26 09:55:07 -05:00
david clarke
b02d022993 fixed race condition (thx michael) 2024-02-23 17:14:28 -07:00
david clarke
94581e3c7a accelerator_for is broken 2024-02-23 15:58:33 -07:00
david clarke
88b52cc045 Merge branch 'develop' into hisq_fat_links 2024-02-23 14:47:15 -07:00
dbollweg
0a816b5509 Merge branch 'feature/sliceSum_gpu' of https://github.com/dbollweg/Grid into feature/sliceSum_gpu 2024-02-22 21:43:06 -05:00
dbollweg
1c8b807c2e free malloc'd memory 2024-02-22 21:42:44 -05:00
Peter Boyle
44b466e072 Make InsertSliceFast the default at some point in future.
Should I do this now?
2024-02-21 14:51:24 -05:00
Peter Boyle
5e5b471bb2 Put/Get and DEviceToDevice 2024-02-21 14:47:06 -05:00
Peter Boyle
9c2565f64e Working and faster version 2024-02-21 14:46:43 -05:00
Peter Boyle
e1d0a7cec3 Batched blas 2024-02-21 14:38:20 -05:00
Peter Boyle
b19ae8f465 Nbasis method for convenience 2024-02-21 14:36:19 -05:00
Peter Boyle
cdff2c8e18 Updated mrhs adef 2024-02-21 14:27:19 -05:00
Christoph Lehner
66391f84f2 Merge branch 'feature/gpt' of ../Grid into develop 2024-02-21 19:05:00 +01:00
97f7a9ecb3 fix HMC for non-fundamental representations 2024-02-21 08:27:55 +00:00
Dennis Bollweg
15878f7613 sliceSumReduction_cub_large now also faster than CPU on Frontier 2024-02-16 13:55:21 -05:00
dbollweg
e0d5e3c6c7 Merge branch 'paboyle:develop' into feature/sliceSum_gpu 2024-02-16 13:16:37 -05:00
dbollweg
6f3455900e Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs 2024-02-16 13:15:02 -05:00
david clarke
56827d6ad6 accelerator_inline bug 2024-02-14 13:56:57 -07:00
73c0b29535 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-02-13 20:19:32 +00:00
303b83cdb8 Scaling benchmarks, verbosity and MPICH aware in acceleratorInit()
For some reason Dirichlet benchmark fails on several nodes; need to
debug this.
2024-02-13 19:48:03 +00:00
5ef4da3f29 Silence verbose 2024-02-13 19:47:36 +00:00
1502860004 Benchmark scripts 2024-02-13 19:47:02 +00:00
585efc6f3f More benchmark scripts 2024-02-13 19:40:49 +00:00
62055e04dd missing semicolon generates error with some compilers 2024-02-13 18:18:27 +01:00
e4a641b64e removing old Eigen tensor patch 2024-02-13 10:37:14 +01:00
8849f187f1 updating Eigen to 3.4.0 2024-02-13 10:30:22 +01:00
david clarke
db420525b3 fix Simd::Nsimd typo 2024-02-12 15:03:53 -07:00
dbollweg
b5659d106e more test cases 2024-02-09 13:37:14 -05:00
dbollweg
4b43307402 Undo include path changes for level zero api header 2024-02-09 13:07:56 -05:00
dbollweg
09af8c25a2 Merge branch 'paboyle:develop' into feature/sliceSum_gpu 2024-02-09 13:02:59 -05:00
dbollweg
9514035b87 refactor slicesum: slicesum uses GPU version by default now 2024-02-09 13:02:28 -05:00
david clarke
2da09ae99b acceleration compiles and doesn't break scalar mode 2024-02-06 18:40:13 -07:00
david clarke
a38fb0e04a first effort toward accelerators 2024-02-06 18:24:55 -07:00
7019916294 RNG seed change safer for large volumes; this is a long term solution 2024-02-07 00:56:39 +00:00
dbollweg
1514b4f137 slicesum_sycl passes test 2024-02-06 19:08:44 -05:00
91cf5ee312 Updated bench script 2024-02-06 23:45:10 +00:00
david clarke
0a6e2f42c5 small amount of cleanup 2024-02-06 16:32:07 -07:00
dbollweg
ab2de131bd work towards sliceSum for sycl backend 2024-02-06 13:24:45 -05:00
5bfa88be85 Aurora MPI standalone benchmake and options that work well 2024-02-06 16:28:40 +00:00
Dennis Bollweg
5af8da76d7 Fix cuda compilation of Lattice_slicesum_gpu.h 2024-02-01 18:02:30 -05:00
Dennis Bollweg
b8b9dc952d Async memcpy's and cleanup 2024-02-01 17:55:35 -05:00
Dennis Bollweg
79a6ed32d8 Use accelerator_for2d and DeviceSegmentedRecude to avoid kernel launch latencies 2024-02-01 16:41:03 -05:00
dbollweg
caa5f97723 Add sliceSum gpu using cub/hipcub 2024-01-31 16:50:06 -05:00
david clarke
4924b3209e projectU3 yields a unitary matrix 2024-01-23 14:43:58 -07:00
Peter Boyle
eb702f581b Running on 12 rhs on 18 nodes of frontier 2024-01-22 17:44:15 -05:00
Peter Boyle
3d13fd56c5 Precompute phases, save memory in hermitian 2024-01-22 17:43:35 -05:00
Peter Boyle
6f51b49ef8 Use stderr 2024-01-22 17:41:09 -05:00
Peter Boyle
addc638856 Fast localCopyRegion, blockProjectFast 2024-01-22 17:40:38 -05:00
david clarke
00f24f8765 already found some bugs in projection, still needs testing 2024-01-22 05:50:16 -07:00
david clarke
f5b3d582b0 first attempt at U3 projection 2024-01-22 02:49:40 -07:00
david clarke
981c93d67a update Test_fatLinks to accept Naik 2024-01-21 21:09:19 -07:00
david clarke
c020b78e02 Merge branch 'develop' into hisq_fat_links 2024-01-21 20:21:08 -07:00
Peter Boyle
42ae36bc28 WOrking 2024-01-17 16:39:14 -05:00
Peter Boyle
c69f73ff9f Working 2024-01-17 16:38:46 -05:00
Peter Boyle
ca5ae8a2e6 Revert to working. 2024-01-17 16:32:05 -05:00
Peter Boyle
d967eb53de Working for first time 2024-01-17 16:31:12 -05:00
Peter Boyle
839f9f1bbe Don't log memory by default 2024-01-17 16:25:50 -05:00
Peter Boyle
b754a152c6 Flag guard correctly 2024-01-17 16:25:28 -05:00
Peter Boyle
e07cb2b9de Accelerator memory 2024-01-17 16:24:31 -05:00
Peter Boyle
a1f8bbb078 accelerator memory print 2024-01-17 16:24:09 -05:00
Peter Boyle
7909683f3b MultiRHS 2024-01-17 16:21:07 -05:00
Peter Boyle
25f71913b7 MultiRHS coarse 2024-01-04 12:01:17 -05:00
Peter Boyle
34ddd2b7b1 MultiRHS coarse space 2024-01-04 12:00:53 -05:00
Peter Boyle
d5fd90b2f3 Add 48^3 rtest 2024-01-04 12:00:01 -05:00
Peter Boyle
b7c7000d0d Don't need the numerical rounding tolerance in multigrid 2023-12-22 18:10:23 -05:00
Peter Boyle
551f6c4edd Synchronise changes 2023-12-22 18:09:11 -05:00
Peter Boyle
defd814750 Speed up the coarsened matrix matrix evaluation.
It is block project limited.
Could be sped up with calls to Batched GEMM and a data layout change.
2023-12-22 18:07:03 -05:00
Peter Boyle
3d517bbd2a Synchronise decouple from the launch
Speeds up multileg stencils
2023-12-22 18:06:13 -05:00
Peter Boyle
78ab955fec Better padded cell exchange 2023-12-22 18:05:41 -05:00
Peter Boyle
dd13937bb6 Better opt face gather scatter 2023-12-22 18:03:38 -05:00
Peter Boyle
66a1b63aa9 Faster grid/blas layout change.
Halo exchange is now the only slow part.
Revisit
2023-12-21 20:50:18 -05:00
Peter Boyle
22c611bd1a Delete temp file 2023-12-21 18:32:31 -05:00
Peter Boyle
c9bb1bf8ea Passing new BLAs based 2023-12-21 18:31:17 -05:00
2a0d75bac2 Aurora files 2023-12-21 23:20:17 +00:00
Peter Boyle
9e489887cf General coarse multiRHS move to BLAS implementation 2023-12-21 15:24:48 -05:00
Peter Boyle
9feb801bb9 Much simpler GPU implementation 2023-12-21 15:24:06 -05:00
Peter Boyle
c00b495933 Multigrid 2023-12-21 15:23:31 -05:00
Peter Boyle
d22eebe553 BLas options 2023-12-21 15:23:03 -05:00
Peter Boyle
8bcbd82680 BLAS based layout and implementation 2023-12-21 15:21:24 -05:00
Peter Boyle
dfa617c439 Batched SGEMM/DGEMM/ZGEMM/CGEMM
Hip, Cuda version and vanilla CPU
One MKL stub in comments, to be tested as different.
2023-12-21 14:01:18 -05:00
Peter Boyle
48d1f0df89 Optimised partially, working 2023-12-21 12:33:47 -05:00
Peter Boyle
b75cb7a12c Blas batched partial implementation on Frontier only for now 2023-12-21 12:31:33 -05:00
Peter Boyle
332563e037 Debugged, reducing verbose 2023-12-21 12:30:57 -05:00
Peter Boyle
0cce97a4fe verbosity only 2023-12-20 21:30:10 -05:00
Peter Boyle
95a8e4be64 rocblas 2023-12-20 21:27:59 -05:00
Peter Boyle
abcd6b8cb6 Faster version 2023-12-19 15:17:46 -05:00
Peter Boyle
e8f21c9b6d Memmory verbose control improvement 2023-12-19 15:16:58 -05:00
Peter Boyle
f48298ad4e Bug fix 2023-12-11 20:57:02 -05:00
root
645e47c1ba Config for Ampere Altra ARM 2023-12-08 16:17:56 -05:00
Peter Boyle
d1d9827263 Integrator logging update 2023-12-08 12:14:00 -05:00
Peter Boyle
e054078b11 Verbose 2023-12-05 16:15:17 -05:00
Peter Boyle
14643c0aab SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512) 2023-12-04 15:45:57 -05:00
Peter Boyle
b77a9b8947 SDDC compiles starting 2023-11-30 14:31:51 -05:00
Peter Boyle
6835a7f208 Better logging, test on 81 point stencil 2023-11-29 19:20:47 -05:00
Peter Boyle
f59993b979 Nbasis§ 2023-11-29 09:47:36 -05:00
Peter Boyle
2290b8f680 Verbose 2023-11-29 09:47:04 -05:00
Peter Boyle
2c54be651c Further updates 2023-11-29 09:43:29 -05:00
Peter Boyle
e859a199df Reduce volume to interior for coarse stencil -- worth up to 4x gain 2023-11-28 10:23:16 -05:00
Peter Boyle
0a3682ad0b MultiRHS work 2023-11-28 07:43:37 -05:00
Peter Boyle
59abaeb5cd Time stamp 2023-11-24 12:56:45 -05:00
Peter Boyle
3e448435d3 Restrict to interior 2023-11-23 18:23:29 -05:00
Peter Boyle
a294bc3c5b Relax constraints for multiRHS 2023-11-23 18:20:42 -05:00
Peter Boyle
b302ad3d49 multiRHS test in place, passes Yay! 2023-11-23 18:20:15 -05:00
Peter Boyle
82fc4b1e94 Finalise 2023-11-23 18:19:41 -05:00
Peter Boyle
b4f1740380 Finalise message 2023-11-23 18:19:16 -05:00
Peter Boyle
031f85247c multRHS initial support -- needs optimisation for multi project/promote.
Bug fix in freeing intermediate grids to stop double free
2023-11-23 18:18:35 -05:00
Peter Boyle
639cc6f73a better support for multiRHS coarse space
Still to add restriction of domain of last loop to interior of padded cell (expect about 4.5x on test volume on Crusher)
2023-11-23 18:16:26 -05:00
Peter Boyle
09946cf1ba Improved, works on 48^3 moving to multiRHS optimisations 2023-11-15 18:03:05 -05:00
Peter Boyle
f4fa95e7cb Use 5.3.0 2023-11-15 18:01:38 -05:00
Peter Boyle
100e29e35e Allow expression as argument to norm2 2023-11-15 18:00:44 -05:00
Peter Boyle
4cbe471a83 devVector 2023-11-15 18:00:07 -05:00
Peter Boyle
8bece1f861 Faster to transpose the matrix and apply with column major order 2023-11-15 17:58:38 -05:00
Peter Boyle
a3ca71ec01 Lots more setup options, still working on them 2023-11-15 17:58:04 -05:00
Peter Boyle
e0543e8af5 Implement flexible preconditioned CG 2023-11-15 17:57:39 -05:00
Peter Boyle
c1eb80d01a Print which have converged 2023-11-15 17:57:08 -05:00
Peter Boyle
a26121d97b Better printing 2023-11-15 17:56:45 -05:00
Peter Boyle
043031a757 Report resid on failed convergence 2023-11-15 17:56:22 -05:00
Peter Boyle
807aeebe4c Resize tol in constructor 2023-11-15 17:55:57 -05:00
Peter Boyle
8aa1a37aad For Mirs preconditioner solver 2023-11-15 17:55:32 -05:00
Peter Boyle
7d077fe493 Frontier compiel 2023-11-09 13:58:44 -05:00
david clarke
9cd4128833 fix naik bug 2023-11-03 14:11:38 -06:00
david clarke
c8b17c9526 Naik to CShift 2023-11-02 12:43:22 -06:00
david clarke
2ae2a81e85 attempt to fix Naik 2023-10-31 13:54:55 -06:00
david clarke
69c869d345 fixed stupid typo 2023-10-30 17:41:52 -06:00
david clarke
df9b958c40 naik now returns separately 2023-10-30 17:40:53 -06:00
david clarke
3d3376d1a3 LePage works, trying Naik 2023-10-27 16:26:31 -06:00
Peter Boyle
4efa042f50 C++17 change 2023-10-24 10:57:50 -04:00
Peter Boyle
c7cb37e970 c++17 accepted 2023-10-24 10:57:24 -04:00
Peter Boyle
d34b207eab Avoid HIP warnings 2023-10-24 10:57:04 -04:00
Peter Boyle
0e6fa6f6b8 DOn't need the Cshift for the period optimisation 2023-10-24 10:56:31 -04:00
Peter Boyle
38b87de53f This works around a stacksize limit on AMD GPU 2023-10-24 10:56:07 -04:00
Peter Boyle
aa5047a9e4 Faster blockProject blockPromote 2023-10-24 10:49:55 -04:00
Peter Boyle
24b6ee0df9 M4 file 2023-10-24 10:36:48 -04:00
Peter Boyle
1e79cc9cbe Avoid compiler error 2023-10-24 10:36:09 -04:00
Peter Boyle
b3925df9c3 Verbose on CPU-GPU xfer, remove performance by default 2023-10-24 10:25:01 -04:00
Christoph Lehner
f2648e94b9 getHostPointer added to Lattice 2023-10-23 13:47:41 +02:00
Peter Boyle
351795ac3a Better messaging 2023-10-20 19:33:04 -04:00
Peter Boyle
9c9c42d0df Tests on frontier with real speed up . 3.5x on 16^3 at mq=0.01 2023-10-20 19:27:13 -04:00
Peter Boyle
b6ad1bafc7 Normal memory SendToRecvFrom asynchronous for use in general stencil
code
2023-10-20 19:27:13 -04:00
Peter Boyle
a5ca40f446 Better verbose -- track CPU GPU motion under --log Memory, others go to
debug output stream
2023-10-20 19:27:13 -04:00
Peter Boyle
9ab54c5565 Overlap comms & data copy/buffer assembly in Ghost zone exchange 2023-10-20 19:27:13 -04:00
Peter Boyle
4341d96bde Massively sped up coarse grid mult, comms
Save 3ms spend (60% of time !) on cudaMalloc !!
2023-10-20 19:27:13 -04:00
Peter Boyle
5fac47a26d Faster halo exchange 2023-10-20 19:27:13 -04:00
Peter Boyle
e064f17346 Faster halo exchange 2023-10-20 19:27:13 -04:00
Peter Boyle
afe10ba2a2 More digits 2023-10-20 19:27:13 -04:00
Peter Boyle
7cc3435ba8 Imporved General coarsened matrix 2023-10-20 19:27:13 -04:00
Peter Boyle
541772313c Verbosity 2023-10-20 19:27:13 -04:00
Peter Boyle
3747494a09 Notify delet public 2023-10-20 19:27:13 -04:00
Peter Boyle
f2b98d0dcc Const safety 2023-10-20 19:27:13 -04:00
Peter Boyle
80471bf762 Alternate implementation involving face operations 2023-10-20 19:27:13 -04:00
Peter Boyle
a06f63c110 Improved I/O and non-lexico option exposed to SciDAC format 2023-10-20 19:27:13 -04:00
Peter Boyle
0ae4478cd9 Checkpoint the subspace and ldop 2023-10-20 19:27:13 -04:00
Peter Boyle
ae4e705e09 Use random vec as easier for debug 2023-10-20 19:27:13 -04:00
Peter Boyle
f5dcea9dbf Updates for Frontier 2023-10-20 19:27:12 -04:00
david clarke
21ed6ac0f4 added floating-point support 2023-10-20 13:54:26 -06:00
david clarke
7bb8ab7000 improve smearing templating 2023-10-20 08:41:02 -06:00
david clarke
2c824c2641 Merge branch 'develop' into hisq_fat_links 2023-10-17 16:03:59 -06:00
david clarke
391fd9cc6a try lepage term 2023-10-17 14:57:15 -06:00
Peter Boyle
2207309f8a Spack rules 2023-10-16 18:38:24 -04:00
Peter Boyle
51051df62c 3GeV run setup 2023-10-16 20:49:52 +03:00
Peter Boyle
33097681b9 FTHMC compiled and merged to develop 2023-10-14 00:42:55 +03:00
Peter Boyle
07e4900218 FTHMC commit 2023-10-13 18:21:57 +03:00
Peter Boyle
36ab567d67 FTHMC 3 Gev 2023-10-13 18:21:57 +03:00
Peter Boyle
e19171523b FTHMC Status at lattice conference commit 2023-10-13 18:21:56 +03:00
Peter Boyle
9626a2c7c0 Asynch handling 2023-10-13 18:21:56 +03:00
Peter Boyle
e936f5b80b IfGridTensor shorthand 2023-10-13 18:21:56 +03:00
Peter Boyle
ffc0639cb9 Running in HMC tests 2023-10-13 18:21:56 +03:00
Peter Boyle
c5b43b322c traceProduct eliminates non-contributing intermediate terms 2023-10-13 18:21:56 +03:00
Peter Boyle
c9c4576237 Improved frontier cshift 2023-10-13 18:21:56 +03:00
david clarke
bf4369f72d clean up HISQSmear with decltypes 2023-10-12 12:41:06 -06:00
david clarke
36600899e2 working 7-link; Grid_log; generalShift 2023-10-12 11:11:39 -06:00
david clarke
b9c70d156b Merge branch 'develop' into hisq_fat_links 2023-10-10 22:44:17 -06:00
david clarke
eb89579fe7 Merge remote-tracking branch 'origin/develop' into develop 2023-10-10 22:43:51 -06:00
david clarke
0cfd13d18b 7-link working 2023-10-10 22:41:52 -06:00
Christoph Lehner
e6ed516052 merged 2023-10-08 09:00:37 +02:00
Christoph Lehner
e2a3dae1f2 Option for multiple simultaneous CartesianStencils 2023-10-08 08:58:44 +02:00
Peter Boyle
2111e7ab5f Run at physical mass 2023-10-06 21:20:21 -04:00
Peter Boyle
d29abfdcaf Transfer code to Frontier now 2023-10-06 21:03:34 -04:00
Peter Boyle
a751c42cc5 Checkpoint restore the setup 2023-10-06 21:03:08 -04:00
Peter Boyle
6a3bc9865e Verbose change 2023-10-06 21:02:04 -04:00
Peter Boyle
4d5f7e4377 Verbose change 2023-10-06 21:01:37 -04:00
Peter Boyle
78b117fb78 Comment fix 2023-10-06 21:01:15 -04:00
Peter Boyle
ded63a1319 Verbose change/pretty print 2023-10-06 21:00:53 -04:00
Peter Boyle
df3e4d1e9c Return fix 2023-10-06 21:00:21 -04:00
Peter Boyle
b58fd80379 I/O for coarse op and reorganise multigrid headers 2023-10-06 13:43:46 -04:00
Peter Boyle
7f6e0f57d0 No IO in file 2023-10-06 13:39:53 -04:00
Peter Boyle
cae27678d8 gpermute 2023-10-06 13:39:19 -04:00
Peter Boyle
48ff655bad Slightly less verbose 2023-10-06 10:47:52 -04:00
Peter Boyle
2525ad4623 Slight clean up 2023-10-06 10:47:32 -04:00
Peter Boyle
e7020017c5 Reorganise multigrid 2023-10-06 10:47:12 -04:00
Peter Boyle
eacebfad74 Reorganise multigrid into multiple headers 2023-10-06 10:46:21 -04:00
Peter Boyle
3bc2da5321 Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1 2023-10-05 16:57:59 -04:00
Peter Boyle
2d710d6bfd Optimised parameters for 16^3 2023-10-05 16:56:55 -04:00
Peter Boyle
6532b7f32b Eliminate older inefficient coarsening implementation 2023-10-05 16:56:15 -04:00
Peter Boyle
7b41b92d99 Only need to bad non-local dimensions 2023-10-05 16:55:48 -04:00
Peter Boyle
dd557af84b ADEF1 and ADEF2 2 level CG 2023-10-05 16:55:19 -04:00
Peter Boyle
59b9d0e030 coalesceRead the blockSum 2023-10-05 16:54:48 -04:00
Peter Boyle
b82eee4733 Hermitian dealing with 2023-10-05 16:54:14 -04:00
Peter Boyle
6a87487544 Running on Frontier, fix RNG big volume y2k, affecting 5D RNG 2023-10-05 16:50:59 -04:00
Peter Boyle
fcf5023845 Running on Frontier 2023-10-05 16:50:59 -04:00
Peter Boyle
c8adad6d8b First runs on Summit. PopulateAdag needs work 2023-10-05 16:50:54 -04:00
Peter Boyle
737d3ffb98 ADEF1 and 1 hop projection 2023-10-03 14:22:18 -04:00
Peter Boyle
6d0c2de399 Deprecate teh PVC directory and make a PVC-OEM generic PVC target with
no queueing system dependency -- just interactive scripts
2023-10-03 17:04:20 +00:00
Peter Boyle
7786ea9921 Bug fix in script 2023-10-03 09:58:44 -07:00
Peter Boyle
d93eac7b1c Performance regressed and is OK in icpx 2023.2 2023-10-03 15:53:14 +00:00
Peter Boyle
b01e67bab1 coalescedReadGeneralPermute now working 2023-10-02 17:46:57 -04:00
Peter Boyle
8a70314f54 Merge branch 'develop' into feature/scidac-wp1 2023-10-02 17:24:55 -04:00
Peter Boyle
afc316f501 Rename headers 2023-10-02 16:25:11 -04:00
Peter Boyle
f14bfd5c1b Relocate sub includes 2023-10-02 16:23:38 -04:00
Peter Boyle
c5f1420dea Merge remote-tracking branch 'LupoA/develop' into LupoA-develop 2023-10-02 16:22:35 -04:00
Peter Boyle
018e6da872 Merge pull request #440 from giltirn/feature/paddedcellgauge
Feature/paddedcellgauge
2023-10-02 10:00:42 -04:00
Peter Boyle
b77bccfac2 Merge pull request #444 from mmphys/feature/docX
Update doc complete list of Macports needed to build Grid on a fresh Mac
2023-10-02 09:57:11 -04:00
Peter Boyle
36ae6e5aba Fastest GPU version.
Need to work on the PaddedCell now to make much faster
2023-09-29 18:26:51 -04:00
Peter Boyle
9db585cfeb Temporary commit while optimisation is carried out 2023-09-29 17:11:35 -04:00
Peter Boyle
c564611ba7 Annoying hack that is useful to preserve for profiling 2023-09-29 17:11:12 -04:00
Peter Boyle
e187bcb85c Updating 2023-09-29 17:10:17 -04:00
Peter Boyle
be18ffe3b4 Further tuning and lanczos 2023-09-27 16:21:58 -04:00
Peter Boyle
0d63dce4e2 Timing info 2023-09-27 16:21:14 -04:00
Peter Boyle
26b30e1551 Flop count and projection to nearest neighbour (keeps redundant flops) 2023-09-27 16:20:11 -04:00
Peter Boyle
7fc58ac293 Verbose subspace init 2023-09-27 16:19:45 -04:00
Peter Boyle
3a86cce8c1 Compile 2023-09-27 16:19:18 -04:00
Peter Boyle
80359e0d49 Bland SYCL compile 2023-09-26 13:20:27 -07:00
Peter Boyle
3d437c5cc4 Making SYCL happy 2023-09-26 13:19:42 -07:00
Peter Boyle
37884d369f Coarse space is expensive, but gives a speed up in fine matrix multiplies now.
Down to optimisation
2023-09-25 17:24:19 -04:00
Peter Boyle
9246e653cd Basic non-local coarsening of operator test 2023-09-25 17:20:58 -04:00
Peter Boyle
64283c8673 Normal equations becomes linear function for easy base class pass aroudn 2023-09-25 17:19:39 -04:00
Peter Boyle
755002da9c Comparison convenience 2023-09-25 17:16:33 -04:00
Peter Boyle
31b8e8b437 Better messaging 2023-09-25 17:16:14 -04:00
Peter Boyle
0ec0de97e6 Adef2 implemented and working in an HDCG like context 2023-09-25 17:15:03 -04:00
Peter Boyle
6c3ade5d89 Improved the coarsening 2023-09-25 17:14:40 -04:00
Peter Boyle
980c5f9a34 Update chebyshev setup 2023-09-25 17:12:22 -04:00
david clarke
63d9b8e8a3 Merge remote-tracking branch 'origin/develop' into hisq_fat_links 2023-09-16 23:20:31 -06:00
david clarke
d247031c98 try 7-link 2023-09-16 23:18:16 -06:00
Peter Boyle
e29b97b3ea Qslash term added 2023-09-14 16:14:03 -04:00
Peter Boyle
ad2b699d2b Better macos 2023-09-14 16:12:21 -04:00
Peter Boyle
471ca5f281 Power method more iterations 2023-09-07 10:55:05 -04:00
Peter Boyle
e82ddcff5d Working getting closer to HDCG but some low level engineering work still needed
+ MUCH work on optimisation
2023-09-07 10:53:51 -04:00
Peter Boyle
b9dcad89e8 Test cases for coarsening with non-local stencil 2023-09-07 10:53:22 -04:00
Peter Boyle
993f43ef4a Even odd use case 2023-09-07 10:53:06 -04:00
Peter Boyle
2b43308208 First cut non-local coarsening 2023-08-25 17:38:07 -04:00
Peter Boyle
04a1ac3a76 First cut for non-local coarsening 2023-08-25 17:37:38 -04:00
Peter Boyle
990b8798bd Merge remote-tracking branch 'refs/remotes/origin/develop' into develop 2023-08-25 17:36:45 -04:00
Peter Boyle
b334a73a44 Stencil improvement 2023-08-25 17:35:10 -04:00
Peter Boyle
5d113d1c70 Odd address sanitizer complain 2023-08-25 17:34:18 -04:00
Peter Boyle
c14977aeab Random vector option for test purposes 2023-08-25 17:33:31 -04:00
Peter Boyle
3e94838204 Spread out improvement 2023-08-25 17:31:28 -04:00
Peter Boyle
c0a0b8ca62 NEON and address sanitiser 2023-08-25 17:30:30 -04:00
Peter Boyle
b8a7004365 Partial fraction test 2023-08-14 15:17:03 -04:00
david clarke
affff3865f Merge branch 'develop' into hisq_fat_links 2023-08-11 23:08:04 -06:00
david clarke
9c22655b5a Merge remote-tracking branch 'origin/develop' into develop 2023-08-11 23:06:42 -06:00
david clarke
99d879ea7f 5-link first attempt 2023-08-11 22:56:30 -06:00
Michael Marshall
bd56c95a6f Update documentation with complete list of Macports needed to build Grid on a fresh Mac 2023-07-14 13:50:06 +01:00
Peter Boyle
994512048e Merge pull request #439 from felixerben/bugfix/IRL_convergence
Bugfix/irl convergence
2023-07-12 16:32:26 -04:00
chillenzer
dbd8bb49dc Merge pull request #32 from LupoA/sp2n/develop
Sp2n/develop
2023-07-04 15:23:43 +00:00
Julian Lenz
3a29af0ce4 Fixed linker error 2023-07-04 16:08:44 +01:00
Julian Lenz
f7b79cdd45 Added test for ProjectSpn 2023-07-03 18:00:32 +01:00
Alessandro Lupo
075b9d22d0 adjoint rep implemented as 2indx symmetric 2023-07-02 13:58:31 +01:00
Alessandro Lupo
b92428f05f better test 2023-07-02 13:34:03 +01:00
Alessandro Lupo
34b11864b6 prettiest tests 2023-07-02 13:25:57 +01:00
Christopher Kelly
1dfaa08afb The stencils for the staple and rect-staple padded cell implementations are now created and stored by workspace classes that allow for reuse providing the grids remain consistent
The workspaces are now used by the plaq+rectangle gauge action resulting in a further 2x performance improvement as measured on a 16^4 local volume for 2 nodes (16 ranks) of Crusher
2023-06-28 15:11:24 -04:00
david clarke
9d263d9a7d fix bug in HISQSmearing; move benchmark b/c i don't understand how makefiles work 2023-06-28 10:05:34 -06:00
david clarke
9015c229dc add benchmark to see whether matrix multiplication is slower than read from object 2023-06-27 21:28:26 -06:00
Christopher Kelly
f44dce390f Implemented acclerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding
Fixed const correctness on PaddedCell methods
Fixed compile issues on Crusher
Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance
Optimized kernel for StaplePadded
Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings
2023-06-27 14:58:10 -04:00
Christopher Kelly
bb71e9a96a Added PaddedCell and GeneralisedLocalStencil header includes to standard base headers
Moved versions of the padded-cell implementations of staple and rect-staple from test code to WilsonLoops header
Added StapleAndRectStapleAll which is now called by the plaq+rectangle action class. Under the hood it uses the padded cell implementations with maximal reuse of the padded gauge links
2023-06-27 11:23:30 -04:00
78bae9417c returning Nstop vectors even if not all meet true convergence criterion 2023-06-27 14:38:19 +01:00
dd170ead01 whitespace 2023-06-27 11:37:01 +01:00
014704856f do one more iteration if not all vectors converged 2023-06-27 11:33:30 +01:00
david clarke
a7eabaad56 rudimentary appendShift convenience method, which allows the user to append an arbitrary shift in one line 2023-06-26 23:59:28 -06:00
david clarke
eeb4703b84 develop wrappers to make the stencils easier to construct 2023-06-26 17:45:35 -06:00
david clarke
a07421b3d3 Merge branch 'develop' into hisq_fat_links 2023-06-26 13:51:32 -06:00
david clarke
cda53b4068 Merge remote-tracking branch 'origin/develop' into develop 2023-06-26 13:51:06 -06:00
Christopher Kelly
6f6844ccf1 Added new StapleAll and RectStapleAll functions that return the staples for all mu as an array
Modified plaq+rectangle gauge actions to use the above
Added a test code to confirm the above changes
2023-06-26 15:48:47 -04:00
Christopher Kelly
4c6613d72c Modified RectStapleDouble and RectStapleOptimised to use Gauge-BC respecting CshiftLink
Added test code tests/debug/Test_optimized_staple_gaugebc demonstrating equivalence of above to RectStapleUnoptimised for cconj gauge BCs
Removed optimized staple only being used for periodic gauge BCs; it is now always used
2023-06-26 10:20:23 -04:00
Peter Boyle
ee92e08edb Merge pull request #435 from fjosw/fix/warnings_in_WilsonKernelsImplementation
Unused variable in WilsonKernelsImplementation
2023-06-23 11:47:19 -04:00
Peter Boyle
c1dcee9328 Merge pull request #437 from fjosw/fix/stencil_debug
Added GridLogDebug to BuildSurfaceList debug message
2023-06-23 11:47:00 -04:00
Alessandro Lupo
559257bbe9 better documentation and filelist names 2023-06-23 16:16:48 +01:00
Peter Boyle
6b150961fe Better script 2023-06-23 18:09:25 +03:00
Alessandro Lupo
cff1f8d3b8 rm unused variables and formatting 2023-06-23 16:04:18 +01:00
Alessandro Lupo
f27d2083cd adjustments in SUn and Sp2n impl 2023-06-23 15:34:08 +01:00
Christopher Kelly
36cc9c524f Threaded the constructor of GeneralLocalStencil 2023-06-23 09:57:38 -04:00
Alessandro Lupo
2822487450 rm unncessary line 2023-06-23 14:55:23 +01:00
Alessandro Lupo
e07fafe46a minor adjustments to twoindex 2023-06-23 12:18:04 +01:00
Alessandro Lupo
063d290bd8 missing function 2023-06-23 11:11:20 +01:00
Alessandro Lupo
4e6194d92a Avoid code duplication in ProjectSUn 2023-06-23 11:03:50 +01:00
Alessandro Lupo
de30c4e22a minor improvements 2023-06-23 10:49:41 +01:00
david clarke
df99f227c1 include missing staple orientations; invert path direction, which was backwards 2023-06-22 14:57:10 -06:00
Peter Boyle
5bafcaedfa Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-06-22 19:59:45 +03:00
Peter Boyle
bfeceae708 FTHMC 2023-06-22 12:58:18 -04:00
Peter Boyle
eacb66591f Config command 2023-06-22 19:56:40 +03:00
Peter Boyle
fadaa85626 Update 2023-06-22 19:56:27 +03:00
Peter Boyle
02a5b0d786 Updating run during testing 2023-06-22 19:52:46 +03:00
Peter Boyle
0e2141442a Dennis says broken 2023-06-22 19:19:51 +03:00
Peter Boyle
769eb0eecb Precision coverage 2023-06-22 19:19:20 +03:00
Christopher Kelly
4241c7d4a3 Imported coalescedReadGeneralPermute GPU implementation from Christoph
Fixed bug in padded staple code where extract was being called on the result before the GPU view was closed
Fixed compile issue with pointer cast in padded staple code
Added timing summaries of padded staple code and timing breakdown of staple implementation to Test_padded_cell_staple
2023-06-21 16:01:01 -04:00
david clarke
d536c67b9d add HISQSmearing to Smearing.h 2023-06-20 16:04:48 -06:00
david clarke
f44f005dad rename _lvl1 --> _linkTreatment 2023-06-20 15:48:27 -06:00
david clarke
26b2caf570 add template parameter to Smear_HISQ_fat for MILC interfacing 2023-06-20 15:37:54 -06:00
Christopher Kelly
7b11075102 The user can now specify the implementation of Cshift used by the PaddedCell class through a virtual base class API. Implementations for default (regular Cshift) and for gauge links (which respects the gauge BCs)
Fixed const-correctness for PaddedCell and ConjugateGimpl::setDirections
Modified test code for padded-cell implementation of staple, rect-staple to use cconj BCs
2023-06-20 17:09:56 -04:00
Christopher Kelly
abc658dca5 Added coalescedReadGeneralPermute CPU implementation based on Christoph's GPT code
In a test code, implemented a padded-cell version of the staple and rectangular-staple calculation
2023-06-20 16:14:25 -04:00
david clarke
8bb078db25 Merge branch 'develop' into hisq_fat_links 2023-06-20 13:05:00 -06:00
david clarke
b61ba40023 Merge remote-tracking branch 'origin/develop' into develop 2023-06-20 13:04:53 -06:00
Christoph Lehner
452bf2e907 Accelerator basisRotate also on HIP 2023-06-20 20:36:24 +03:00
Alessandro Lupo
2372275b2c Merge pull request #36 from LupoA/sp2n/gpu-bugfix
Sp2n/gpu bugfix [close #30]
2023-06-20 13:46:00 +01:00
chillenzer
ef736e8aa4 Merge pull request #35 from LupoA/sp2n/enableSp
consistent enable sp config flag
2023-06-20 10:41:09 +00:00
Julian Lenz
5e539e2d54 Forgot some follow-ups on changed signature 2023-06-18 12:37:51 +01:00
Julian Lenz
96773f5254 Apparently forgot to remove one Lattice version 2023-06-18 12:21:39 +01:00
Alessandro Lupo
d80df09f3b consistent enable sp config flag 2023-06-16 19:16:46 +01:00
Julian Lenz
621e612c30 Fix non-zero ret on device bug 2023-06-16 16:27:49 +01:00
Julian Lenz
8c3792721b ClangFormat 2023-06-16 15:58:23 +01:00
Julian Lenz
c95bbd3948 Remove accelerated lattice version 2023-06-16 15:50:26 +01:00
Julian Lenz
e28ab7a732 Re-included instantiations for symmetric 2Index AS Sp 2023-06-16 14:20:37 +01:00
Alessandro Lupo
c797cbe737 deal with post-merge trauma 2023-06-16 14:20:37 +01:00
Alessandro Lupo
e09dfbf1c2 definetely the right merge upstream/develop 2023-06-16 14:19:46 +01:00
85e35c4da1 fix: added GridLogDebug to BuildSurfaceList debug message. 2023-06-16 10:31:16 +01:00
Peter Boyle
d72e914cf0 Profiling temporary code until optimised 2023-06-15 10:43:04 -04:00
Peter Boyle
3b5254e2d5 Optional checkpoint smeared configs for FTHMC 2023-06-15 10:43:04 -04:00
Peter Boyle
f1c358b596 Additional tests 2023-06-15 10:43:04 -04:00
Peter Boyle
c0ef210265 Hot start should be properly Hot 2023-06-15 10:43:04 -04:00
Peter Boyle
e3e1cc1962 Ta project 2023-06-15 10:43:04 -04:00
Peter Boyle
723eadbb5c Keep methods virtual 2023-06-15 10:43:04 -04:00
Peter Boyle
e24637ec1e Clean up 2023-06-15 10:43:04 -04:00
Peter Boyle
8b01ff4ce7 Integrator over to smeared force structure 2023-06-15 10:43:04 -04:00
Peter Boyle
588197c487 Smeared action virtual class 2023-06-15 10:43:04 -04:00
Julian Lenz
116d90b0ee First attempt on #30 2023-06-15 15:09:37 +01:00
Julian Lenz
b0646ca187 Remove some unused variables 2023-06-15 15:09:09 +01:00
Peter Boyle
1352bad2e4 Sunspot compile 2023-06-15 11:22:46 +00:00
david clarke
14d352ea4f added smearParams struct 2023-06-12 16:55:44 -06:00
david clarke
1cf9ec1cce now compiles 2023-06-09 16:27:45 -06:00
chillenzer
4895ff260e Merge pull request #28 from LupoA/sp2n/config
compile sp2n fermion impl only if declared at config time
2023-06-09 13:07:48 +00:00
david clarke
4b994a1bc7 trouble with compilation 2023-06-08 17:37:25 -06:00
david clarke
e506d6d369 Merge branch 'develop' into hisq_fat_links 2023-06-07 21:16:20 -06:00
david clarke
ab56ad8d7a fix 3-link stencil 2023-06-07 21:14:58 -06:00
Alessandro Lupo
470d93006a compile sp2n fermion impl only if declared at config time 2023-06-07 12:53:33 +01:00
chillenzer
2f3d03f188 Merge pull request #27 from LupoA/sp2n/documentation
documentation for gaugegroup and sp2n
2023-06-01 16:42:27 +00:00
Alessandro Lupo
8db7c23bee improve documentation 2023-06-01 17:39:10 +01:00
chillenzer
69dc5172dc Merge pull request #26 from LupoA/sp2n/irreps
Sp2n/irreps
2023-06-01 16:28:15 +00:00
Julian Lenz
fd72eb6546 Merge branch 'sp2n/algorithm' into sp2n/irreps 2023-06-01 17:24:01 +01:00
Peter Boyle
ffd7301649 Updated masked / fthmc smeared config container 2023-06-01 06:23:02 -04:00
Peter Boyle
d2a8494044 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-06-01 06:22:33 -04:00
Peter Boyle
0982e0d19b Jacobian action wrapper for FTHMC 2023-06-01 06:15:08 -04:00
Peter Boyle
3badbfc3c1 Refactor the Action and Smeared gauge configuration containers. Add first pass at FTHMC action 2023-06-01 06:14:28 -04:00
Peter Boyle
5465961e30 New test for FTHMC portion 2023-06-01 06:14:04 -04:00
477b794bc5 fix: unused variable removed. 2023-05-29 14:08:53 +01:00
Christoph Lehner
e8c29e2fe5 Merge pull request #31 from paboyle/develop
Sync
2023-05-28 16:13:12 +02:00
Peter Boyle
4835fd1a87 HIP stream synch 2023-05-27 17:58:22 +03:00
Peter Boyle
6533c25814 Lumi 2023-05-27 16:13:32 +03:00
Alessandro Lupo
b405767569 make private methods private 2023-05-26 17:02:16 +01:00
Alessandro Lupo
fe88a0c12f cleaner twoindex class, cleaner tests 2023-05-26 16:55:30 +01:00
Alessandro Lupo
e61a9ed2b4 partial revert 2023-05-26 13:54:26 +01:00
Alessandro Lupo
de8daa3824 group is SUn by default 2023-05-26 13:44:41 +01:00
Alessandro Lupo
3a50fb29cb directly call sp helper 2023-05-26 13:28:47 +01:00
Alessandro Lupo
6647d2656f rm unnecessary specialisation 2023-05-26 12:27:22 +01:00
Alessandro Lupo
a6f4dbeb6d remove redundant template parameter 2023-05-26 12:13:40 +01:00
Alessandro Lupo
92a282f2d8 Merge pull request #24 from LupoA/sp2n/fix_static_assert_symmetric
Move static_assert inside of function
2023-05-26 11:13:50 +01:00
Alessandro Lupo
ca2fd9fc7b documentation for gaugegroup and sp2n 2023-05-25 18:40:54 +01:00
david clarke
3825329f8e Merge branch 'develop' into hisq_fat_links 2023-05-24 15:37:25 -06:00
Alessandro Lupo
be1a4f5860 implement TwoIndexSymm for sp2n 2023-05-22 17:21:03 +01:00
Peter Boyle
1b2914ec09 FT-HMC smearing, derivative chain rule, log det and force first pass. 2023-05-22 10:21:37 -04:00
Peter Boyle
519f795066 Header not liked by gcc on mac? puzzling 2023-05-22 10:21:12 -04:00
Alessandro Lupo
5897b93dd4 debug tests, fix dimension 2023-05-22 13:42:21 +01:00
Alessandro Lupo
af091e0881 DimensionHelper for 2index irreps 2023-05-21 16:56:06 +01:00
Alessandro Lupo
3c1e5e9517 Merge pull request #25 from LupoA/sp2n/unify_representations
Sp2n/unify representations [close #3]
2023-05-21 14:55:27 +01:00
Alessandro Lupo
85b2cb7a8a changing some hardcoded SUn lines 2023-05-21 14:50:28 +01:00
david clarke
c7bdf2c0e4 3-link test at least gives an answer 2023-05-21 04:33:20 -06:00
Peter Boyle
4240ad5ca8 Preparing for FTHMC 2023-05-19 21:21:55 -04:00
Peter Boyle
d418347d86 public for convenience to see rho params 2023-05-19 21:21:05 -04:00
Peter Boyle
29a4bfe5e5 Clean up 2023-05-19 21:20:45 -04:00
Peter Boyle
9955bf9daf Regresses to Qlat 2023-05-19 17:32:13 -04:00
Christoph Lehner
da9cbfc7cc Suppress BuildSurfaceList verbosity in Stencil.h 2023-05-19 20:22:20 +02:00
Christoph Lehner
6b9f07c1ed Merge pull request #30 from paboyle/develop
Merge upstream
2023-05-19 20:20:58 +02:00
Julian Lenz
b8bdc2eefb Unified two index representations 2023-05-18 18:36:29 +01:00
Julian Lenz
0078826ff1 Move static_assert inside of function 2023-05-18 18:14:53 +01:00
Julian Lenz
e855c41772 Unified spfundamental.h with fundamental.h 2023-05-18 18:11:20 +01:00
chillenzer
d169c275b6 Merge pull request #22 from LupoA/sp2n/unify_twoindex
Unify TwoIndex
2023-05-18 14:55:02 +00:00
Julian Lenz
a5125e23f4 Typo 2023-05-18 15:41:35 +01:00
Julian Lenz
7b83c80757 Merge branch 'sp2n/unify_twoindex' of github.com:LupoA/Grid into sp2n/unify_twoindex 2023-05-18 15:36:14 +01:00
Julian Lenz
e41821e206 Disable two index symmetric 2023-05-18 15:29:55 +01:00
david clarke
bf91778550 verbose plaquette example; fat link test frame 2023-05-17 15:15:54 -06:00
Alessandro Lupo
5a75ab15a2 typo in 2S dim 2023-05-17 20:47:57 +01:00
Alessandro Lupo
932c783fbf 2AS for every Nc! 2023-05-17 20:22:05 +01:00
Julian Lenz
55f9cce577 Revert "Added automated HMC test for Nc=4"
This reverts commit eee27b8b30.
2023-05-17 09:17:48 +01:00
Alessandro Lupo
b3533ca847 correct tests (failing) 2023-05-16 17:43:52 +01:00
Alessandro Lupo
fd2a637010 test 2index 2023-05-16 14:10:39 +01:00
Julian Lenz
eee27b8b30 Added automated HMC test for Nc=4 2023-05-15 18:37:33 +01:00
Julian Lenz
8522352aa3 ClangFormat 2023-05-15 18:36:05 +01:00
Alessandro Lupo
3beb8f4091 fixing typo, getting pre-changes physics 2023-05-15 16:00:15 +01:00
Alessandro Lupo
12a706e9b1 de-hardcode the number of generators 2023-05-15 15:48:21 +01:00
Alessandro Lupo
170aa7df01 fix (dimension to be improved) 2023-05-15 15:20:18 +01:00
Julian Lenz
e8ad1fef53 Unify TwoIndex 2023-05-12 14:35:50 +01:00
Peter Boyle
876c8f4478 Nodes on padded cell 2023-05-11 12:35:49 -04:00
Peter Boyle
9c8750f261 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-05-11 12:29:09 -04:00
Peter Boyle
91efd08179 Option for Qlat generator basis 2023-05-11 12:27:45 -04:00
Peter Boyle
9953511b65 Mac compile 2023-05-11 12:27:29 -04:00
Peter Boyle
025fa9991a For FTHMC 2023-05-11 12:26:14 -04:00
Peter Boyle
e8c60c355b Padded cell code 2023-05-11 12:25:50 -04:00
Peter Boyle
6c9c7f9d85 Permute fix 2023-05-11 12:24:21 -04:00
Peter Boyle
f534523ede Debug 2023-05-11 12:23:11 -04:00
Peter Boyle
1b8a834beb Debug 2023-05-11 12:22:24 -04:00
Alessandro Lupo
aa9df63a05 rename group projections based on determinants 2023-05-10 14:50:52 +01:00
chillenzer
3953312a93 Merge pull request #20 from LupoA/sp2n/unify_gaugeimpltypes
Sp2n/unify gaugeimpltypes
2023-05-03 15:17:10 +00:00
Julian Lenz
6e62f4f616 ClangFormat 2023-05-03 16:15:12 +01:00
Julian Lenz
6a7bdca53b Take over additional algebra tests from Alessandro 2023-05-03 16:02:02 +01:00
Julian Lenz
c7fba9aace Take over additional group tests from Alessandro 2023-05-03 16:01:48 +01:00
Julian Lenz
ac6c7cb8d6 Merge in Alessandro's changes [test fails] 2023-05-03 02:53:03 +01:00
Julian Lenz
c5924833a1 ClangFormat 2023-05-03 02:39:36 +01:00
Julian Lenz
ac0a74be0d Taken care of algebra tests 2023-05-03 02:32:42 +01:00
Julian Lenz
42b0e1125d Naming and argument types 2023-05-03 01:51:46 +01:00
Julian Lenz
339c4fda79 Extracted is_element_of Sp2n 2023-05-02 15:44:34 +01:00
Alessandro Lupo
9b85bf9402 better projection test 2023-05-02 15:42:20 +01:00
Alessandro Lupo
86b02c3cd8 cleaning up requested by Julian 2023-05-02 13:31:17 +01:00
Alessandro Lupo
7b3b7093fa cleaning up requested by Ed 2023-05-02 12:50:57 +01:00
Alessandro Lupo
881b08a465 Correct implementation of SpTa 2023-04-27 18:17:06 +01:00
Julian Lenz
3ee5444c69 Remove commented out stuff 2023-04-21 08:08:18 +01:00
Julian Lenz
5e28fe56d2 Remove code duplication: Iterating through vectors 2023-04-21 08:08:06 +01:00
Peter Boyle
3aa43e6065 Debug info 2023-04-20 14:21:13 -04:00
Peter Boyle
78ac4044ff HMC 2023-04-20 13:28:07 -04:00
Peter Boyle
119c3db47f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-18 15:13:16 -04:00
Peter Boyle
21bbdb8fc2 Crusher 2023-04-18 15:11:16 -04:00
Alessandro Lupo
5aabe074fe Rename Sympl* to Sp* 2023-04-18 11:50:20 +01:00
Peter Boyle
739bd7572c Example code 2023-04-17 21:51:55 +00:00
Peter Boyle
074627a5bd Pass file descriptors through AF_UNIX for level_zero 2023-04-17 21:50:52 +00:00
Peter Boyle
6a23b2c599 Drop UVM 2023-04-17 21:49:58 +00:00
Alessandro Lupo
dace904c10 fix typo 2023-04-14 18:06:18 +01:00
Alessandro Lupo
be98d26610 small change I missed in previous commit 2023-04-13 17:48:43 +01:00
Peter Boyle
bd891fb3f5 tests to compile 2023-04-12 18:32:44 -04:00
Peter Boyle
3984265851 Merge pull request #432 from paboyle/hotfix/nvcc-warnings
Unused statements generating warnings removed
2023-04-12 16:59:02 -04:00
Peter Boyle
45361d188f Merge pull request #427 from fjosw/feat/bug_report_issue_template
Feat/bug report issue template
2023-04-12 16:58:41 -04:00
Peter Boyle
80c9d77e02 Merge pull request #433 from paboyle/hotfix/virtual-dtor
Virtual destructor for LinearOperator
2023-04-12 16:56:18 -04:00
Peter Boyle
3aff64dddb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-11 12:19:15 -07:00
Peter Boyle
b4f2ca81ff Copy queue and compute queue same as better concurrency 2023-04-11 12:18:21 -07:00
Peter Boyle
d1dea5f840 New driver 2023-04-11 12:16:52 -07:00
Peter Boyle
54f8b84d16 Fence 2023-04-11 12:16:08 -07:00
Peter Boyle
da503fef0e Name change on barrier routine 2023-04-11 12:14:04 -07:00
Peter Boyle
4a6802098a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-07 15:43:28 -04:00
Peter Boyle
f9b41a84d2 Trajectory runs to completion on Crusher within wall clock time 2023-04-07 15:42:45 -04:00
5d7e0d18b9 virtual destructor for LinearOperator 2023-04-07 14:30:38 +01:00
9e64387933 mores unused statements removed 2023-04-07 14:27:18 +01:00
983b681d46 unused statement cleaning 2023-04-07 14:12:02 +01:00
4072408b6f Update README.md 2023-04-07 11:45:28 +01:00
bd76b47fbf Update CI badge in README 2023-04-07 11:44:48 +01:00
Christoph Lehner
5f75735dab Add M and Mdag to WilsonTMFermion 2023-04-06 18:25:05 +02:00
Alessandro Lupo
178376f24b minor stylistic changes 2023-04-06 12:08:17 +01:00
18ce23aa75 Fix NEON SIMD 2023-04-06 11:30:48 +01:00
chillenzer
6a0eb466ee Merge pull request #19 from LupoA/refactoring_sp2n
refactoring sp2n
2023-04-05 10:50:58 +00:00
Peter Boyle
ffa7fe0cc2 Merge branch 'feature/dirichlet' into develop 2023-04-04 23:13:52 -04:00
Peter Boyle
6b979f0a69 Dirichlet improvements that I failed to commit 2023-04-04 23:13:17 -04:00
Alessandro Lupo
4ea29b8f0f Template group into GaugeImplTypes. Closing #2 2023-04-04 17:49:28 +01:00
Alessandro Lupo
778291230a expand ProjecOnGaugeGroup, change ProjectOnSp2nAlgebra into SpTa, fixing some of its issues 2023-04-04 17:48:13 +01:00
Peter Boyle
86dac5ff4f Better printing 2023-04-04 07:42:19 -07:00
Peter Boyle
4a382fad3f Use distinct SYCL queue for copies 2023-04-04 07:41:41 -07:00
Peter Boyle
cc753670d9 Barrier elimination, surface list build 2023-04-04 07:39:14 -07:00
Peter Boyle
cc9d88ea1c Fence changes and EXT kernel loop cout reduction 2023-04-04 07:37:23 -07:00
Peter Boyle
b281b0166e Put the barrier in the subroutine 2023-04-04 07:36:03 -07:00
Peter Boyle
6a21f694ff Apply barrier in Gather kernel sequence.
Could place before comms, or in Gather, but decided to insist Gather means Gather is done
2023-04-04 07:33:24 -07:00
Peter Boyle
fc4db5e963 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-04-03 18:26:11 -04:00
Peter Boyle
6252ffaf76 No unified 2023-04-03 18:25:22 -04:00
Alessandro Lupo
026e736dfa Projection on algebra can now be templated. Fix #12 2023-04-03 16:31:19 +01:00
Alessandro Lupo
4275b3f431 Fix typo and remove unnecessary lines 2023-04-03 12:01:52 +01:00
Peter Boyle
af64c1c6b6 Had managed to drop the accelerator_barrier() in the Wilson Compressor gather 2023-03-30 17:34:44 -04:00
Peter Boyle
866f48391a Temporary fix for develop incorrect results 2023-03-30 17:10:13 -04:00
Peter Boyle
a4df527d74 Merge pull request #428 from mmphys/bugfix/comm_none
Fixes for --enable-comms=none
2023-03-30 08:38:14 -04:00
Michael Marshall
5764d21161 Fixes for --enable-comms=none 2023-03-30 10:15:28 +01:00
Peter Boyle
496d04cd85 Weaken the Fence 2023-03-29 18:58:51 -04:00
Peter Boyle
10e6d7c6ce Merge branch 'feature/dirichlet' into develop 2023-03-29 16:26:47 -04:00
Peter Boyle
c42e25e5b8 Dirichlet remove 2023-03-29 16:25:52 -04:00
Peter Boyle
a00ae981e0 Fence propagation from SYCL 2023-03-29 15:00:40 -04:00
Peter Boyle
58e020b62a Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-29 14:37:40 -04:00
Peter Boyle
a7e1aceeca Compile fix on Nvidia 2023-03-29 14:36:50 -04:00
Peter Boyle
7212432f43 More careful fencing 2023-03-28 20:10:22 -07:00
Peter Boyle
4a261fab30 Changes premerge to develop 2023-03-28 20:04:21 -07:00
Peter Boyle
6af97069b9 Preparing for close of feature/dirichlet
Initial code change review complete
2023-03-28 13:39:44 -07:00
Peter Boyle
5068413cdb Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 08:35:38 -07:00
Peter Boyle
71c6960eea Commet 2023-03-28 08:34:24 -07:00
Peter Boyle
ddf6d5c9e3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 11:33:05 -04:00
39214702f6 feat: indentation fixed. 2023-03-28 16:30:34 +02:00
3e4614c63a feat: draft for bug-report issue template added. 2023-03-28 16:24:35 +02:00
Peter Boyle
900e01f49b Temporary 2023-03-27 21:35:06 -07:00
Peter Boyle
2376156fbc Merge branch 'develop' into feature/dirichlet 2023-03-27 21:33:50 -07:00
Peter Boyle
3f2fd49db4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-03-27 17:29:54 -07:00
Peter Boyle
0efa107cb6 Script update 2023-03-27 17:29:43 -07:00
Peter Boyle
8feedb4f6f Include files moved 2023-03-27 17:29:21 -07:00
Peter Boyle
05e562e3d7 Move the copy synch out to stencil and do one per call instead of one per packet 2023-03-27 17:28:38 -07:00
Peter Boyle
dd3bbb8fa2 MOve the synchronise out to the stencil so one call instead of one call per packet 2023-03-27 17:27:45 -07:00
Peter Boyle
2fbcf13c46 SYCL fix 2023-03-27 14:25:14 -07:00
Peter Boyle
4ea48ef0c4 Merge pull request #419 from lehner/feature/gpt
Separate rankSum from sum
2023-03-24 15:42:16 -04:00
Peter Boyle
5c85774ee3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-24 15:40:57 -04:00
Peter Boyle
d8a9a745d8 stream synchronise 2023-03-24 15:40:30 -04:00
Peter Boyle
dcf172da3b Merge pull request #415 from paboyle/feature/block_lanczos22
Feature/block lanczos22
2023-03-24 12:08:16 -04:00
Peter Boyle
d57ed25071 Merge branch 'feature/dirichlet' into feature/block_lanczos22 2023-03-24 12:08:09 -04:00
Peter Boyle
546be724e7 Merge pull request #421 from UniOfLeicester/feature/accel_Copy_plane
Populate the Cshift_table in the GPU
2023-03-24 12:04:06 -04:00
Peter Boyle
8a1b9073f9 Mshift update 2023-03-23 15:39:30 -04:00
Peter Boyle
1a7114d4b9 Temporary algorithm while sorting out mixed prec 2023-03-23 15:38:35 -04:00
Peter Boyle
3f385f717c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet
Conflicts:
	systems/PVC/benchmarks/run-2tile-mpi.sh
	systems/PVC/config-command
2023-03-23 14:52:53 -04:00
Peter Boyle
481bbaf1fc Interface to query memory use 2023-03-23 12:55:31 -04:00
Peter Boyle
281488611a WriteDiscard on construct 2023-03-23 10:28:50 -04:00
Peter Boyle
c180a52518 Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-23 10:28:01 -04:00
Peter Boyle
90130e25e9 TODO list 2023-03-23 10:27:02 -04:00
Peter Boyle
23298acb81 Merge pull request #424 from giltirn/feature/dirichlet-precchange
Precision change implementation
2023-03-22 23:04:52 -04:00
Peter Boyle
52384e34cf Discard on construct 2023-03-22 19:40:32 -04:00
Peter Boyle
d0bb033ea2 Device resident GPU block buffer instead of UVM as hit likely UVM
bug. Code worked on CUDA 11.4 but fails on later drivers (certainly 530.30.02, but need to
find the perlmutter driver version).
2023-03-22 19:07:32 -04:00
Peter Boyle
c6621806ca Compiling on laptop and running 2023-03-21 17:27:09 -04:00
Peter Boyle
0b6f0f6d2f Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:06:55 -04:00
Peter Boyle
b5b759df73 Merge branch 'develop' into feature/dirichlet 2023-03-21 16:05:46 -04:00
Peter Boyle
7db8dd7a95 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:04:27 -04:00
Peter Boyle
8b43be39c0 Config command 2023-03-21 16:00:52 -04:00
Peter Boyle
f17f879206 Test update 2023-03-21 15:59:29 -04:00
Peter Boyle
68428fceab Integrator update 2023-03-21 15:58:49 -04:00
Peter Boyle
4135f2dcd1 Compressor 2023-03-21 15:41:41 -04:00
Peter Boyle
c5bdf61215 AUdit fix 2023-03-21 15:38:39 -04:00
Peter Boyle
88e218e8ee Stencil updates 2023-03-21 15:37:58 -04:00
Peter Boyle
0f2b786436 Vector -> vector 2023-03-21 15:36:11 -04:00
Peter Boyle
e1c326558a COmms improvements 2023-03-21 08:53:56 -07:00
Peter Boyle
bae0f8ea99 Merge pull request #425 from rrhodgson/feature/CacheLogging
Huge Cache
2023-03-21 08:59:08 -04:00
Peter Boyle
bbbcd36ae5 Merge pull request #426 from rrhodgson/feature/LCDeflation
Batched Local Coherence Tools
2023-03-21 08:58:40 -04:00
Peter Boyle
39c0815d9e WriteDiscard 2023-03-21 08:57:29 -04:00
Alessandro Lupo
1b8176e2c0 fix code duplication 2023-03-17 14:58:00 +00:00
Alessandro Lupo
cbc053c3db Revert "projection on Sp2n algebra, to be used instead of Ta"
This reverts commit ba7f9d7b70.
2023-03-17 11:36:58 +00:00
Alessandro Lupo
cdf3f6ef6e Merge branch 'refactoring_sp2n' of https://github.com/LupoA/Grid into refactoring_sp2n 2023-03-15 15:59:50 +00:00
Alessandro Lupo
ba7f9d7b70 projection on Sp2n algebra, to be used instead of Ta 2023-03-15 15:55:12 +00:00
Peter Boyle
a997d24743 Remove nofma 2023-03-14 12:10:31 -07:00
Peter Boyle
861e5d7f4c SYCL version update. Why do they keep making incompatible changes 2023-03-14 12:10:02 -07:00
Peter Boyle
14cc142a14 Warning remove 2023-03-14 12:09:26 -07:00
Peter Boyle
f36b87deb5 syscall fix 2023-03-14 12:09:00 -07:00
Peter Boyle
eeb6e0a6e3 Renable cache blocking and efficient UPI type SHM comms 2023-03-14 09:10:27 -07:00
Peter Boyle
cad5b187dd Cleanup 2023-03-14 09:08:16 -07:00
Peter Boyle
87697eb07e SHared compile 2023-03-14 09:07:36 -07:00
Alessandro Lupo
371fd123fb consequence of iSUnMatrix being no longer a member of the SU class 2023-03-14 10:47:07 +00:00
Alessandro Lupo
d6ff644aab Towards the day all tests compile 2023-03-14 10:43:25 +00:00
Julian Lenz
29586f6b5e Deactivate some tests for Nc!=3 2023-03-13 08:17:14 +00:00
Alessandro Lupo
fd057c838f add ProjectOnGaugeGroup and ProjectGn to allow future templating in GaugeImplTypes 2023-03-10 12:10:46 +00:00
Alessandro Lupo
f51222086c Move functions from GaugeGroup to group specific implementations 2023-03-09 16:22:20 +00:00
a3e935c902 Batched block project/promote size checks 2023-02-27 11:38:16 +00:00
7731c7db8e Add huge cache type and allow Ncache==0 2023-02-26 14:15:28 +00:00
ff97340324 Expose cached bytes 2023-02-26 12:22:45 +00:00
Christopher Kelly
83d86943db Fixed compile bug in MemoryManagerShared caused by Audit function not being passed a string 2023-02-23 13:09:45 -05:00
Christopher Kelly
e82cf1d311 Further prec-change improvements
Mixed prec CG algorithm has been modified to precompute precision change workspaces

As the original Test_dwf_mixedcg_prec has been coopted to do a performance stability and reproducibility test, requiring the single-prec CG to be run 200 times, I have created a new version of Test_dwf_mixedcg_prec in the solver subdirectory that just does the mixed vs double CG test
2023-02-23 09:45:29 -05:00
Christopher Kelly
1db58a8acc Precision change improvements
Added a new, much faster implementation of precision change that uses (optionally) a precomputed workspace containing pointer offsets that is device resident, such that all lattice copying occurs only on the device and no host<->device transfer is required, other than the pointer table. It also avoids the need to unpack and repack the fields using explicit lane copying. When this new precisionChange is called without a workspace, one will be computed on-the-fly; however it is still considerably faster than the original implementation.

In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes.

Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces

Renamed the original precisionChange as precisionChangeOrig

Fixed incorrect pointer offset bug in copyLane

Added a test and a benchmark for precisionChange

Added a test for reliable update CG
2023-02-21 10:52:42 -05:00
920a51438d Added batched Mixed precision CG 2023-02-14 17:04:13 +00:00
be528b6d27 Add batched block project/promote functions 2023-02-14 14:37:10 +00:00
Alessandro Lupo
f73691ec47 Merge pull request #18 from nickforce989/sp2n/newbranch
Sp2n/newbranch
2023-02-13 10:22:27 +01:00
Peter Boyle
ccd21f96ff Plaquette agreeing and moving to final form (slowly) need to optimise 2023-02-01 22:57:44 -05:00
Peter Boyle
4b90cb8888 First cut passes combining padded cell with general stencil towards fast plaquette and staggered force 2023-02-01 22:14:10 -05:00
Niccolo Forzano
7ebda3e9ec Merge commit 'b10e1b7bc8bec809f874e9e48a3ccc7b2619c9d1' into sp2n/newbranch 2023-01-19 12:10:18 +00:00
Niccolo Forzano
b10e1b7bc8 Fixed files giving zero force computation on GPU, issue #8 2023-01-18 18:04:47 +00:00
Peter Boyle
796abfad80 Merge pull request #422 from fjosw/fix/NVCC_DIAG_PRAGMA_SUPPORT
Disable diagnostic pragma warnings for CUDA 12+
2023-01-17 09:34:49 -05:00
ad0270ac8c fix: diagnostic pragma warnings fixed for CUDA 12+ 2023-01-12 12:36:30 +00:00
Makis Kappas
7d62f1d6d2 Populate the Cshift_table in the GPU
Cshift is allocated in Unified memory and used
in the LambdaApply kernels but also populated
from the host. This creates a lot of Unified HtoD
and DtoH mem operations and has a negative effect
in performance. With this commit we populate the
Cshift table in the device with the
populate_Cshift_table() kernel.
2023-01-11 21:26:25 +00:00
Christoph Lehner
458c943987 merged upstream 2022-12-31 11:16:21 +02:00
Christoph Lehner
88015b0858 Split sum in rankSum and GlobalSum 2022-12-26 10:01:32 +01:00
Peter Boyle
4ca1bf7cca Added gauge invariance test 2022-12-21 07:23:16 -05:00
Peter Boyle
2ff868f7a5 CPU open doesn't need to free space 2022-12-20 05:10:23 -05:00
Peter Boyle
ede02b6883 Memory manager debug Felix case 2022-12-20 05:10:23 -05:00
Peter Boyle
1822ced302 Bug fix 2022-12-20 05:10:23 -05:00
Peter Boyle
37ba32776f More logging 2022-12-20 05:10:23 -05:00
Peter Boyle
99b3697b03 More loggin 2022-12-20 05:10:23 -05:00
Peter Boyle
43a45ec97b SSC_START 2022-12-20 05:10:23 -05:00
Peter Boyle
b00a4142e5 A=A fix 2022-12-20 05:10:23 -05:00
Peter Boyle
3791bc527b Logging pulled in from dirichlet branch 2022-12-20 05:10:23 -05:00
Alessandro Lupo
d7dea44ce7 Merge pull request #17 from chillenzer/unify_gauge_groups
Fix compilation error in nvcc (closes #15)
2022-12-19 16:24:03 +00:00
Peter Boyle
d8c29f5fcf Updated FFT test for PETSc 2022-12-18 12:05:00 -05:00
Julian Lenz
37b6b82869 Fix file extensions 2022-12-18 16:12:56 +00:00
Julian Lenz
92ad5b8f74 Compiler error fix: NVCC requires names for templ. par. 2022-12-18 15:50:19 +00:00
Peter Boyle
281f8101fe Matt FFT test 2022-12-17 20:35:33 -05:00
Peter Boyle
472ed2dd5c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-12-17 20:17:09 -05:00
Peter Boyle
4f85672674 Simpler test for PETSc 2022-12-17 20:16:11 -05:00
Peter Boyle
dc747c54be Merge branch 'develop' into feature/dirichlet
Conflicts:
	Grid/qcd/action/fermion/WilsonCompressor.h
	Grid/stencil/Stencil.h
2022-12-13 08:24:58 -05:00
Peter Boyle
140684d706 Head to head vs HMC 2022-12-13 08:15:38 -05:00
Peter Boyle
5bb7ba92fa Test for DDHMC force term 2022-12-13 08:15:11 -05:00
Peter Boyle
b54d0f3c73 Smaller deltaH down to 7000s on t=0.5 trajectory 2022-12-13 08:14:27 -05:00
Peter Boyle
ff6777a98d Variable depth experiments 2022-12-13 08:13:51 -05:00
Peter Boyle
07acfe89f2 Merge pull request #417 from rrhodgson/feature/fermtoprop
Feature/fermtoprop
2022-12-06 12:45:03 -05:00
40234f531f FermToProp accelerator_for -> thread_for 2022-12-06 17:34:51 +00:00
d49694f38f PropToFerm fix 2022-12-06 15:48:54 +00:00
Alessandro Lupo
8c80f1c168 Merge pull request #14 from chillenzer/unify_gauge_groups
Unify gauge groups (closes #5)
2022-12-01 17:35:46 +00:00
Chulwoo Jung
dc6a38f177 Minor cleanup 2022-11-30 17:13:12 -05:00
Chulwoo Jung
82c1ecf60f Block lanczos added 2022-11-30 16:08:40 -05:00
Peter Boyle
67f569354e Partial dirichlet changes 2022-11-30 15:51:13 -05:00
Peter Boyle
97a098636d FermToProp 2022-11-30 15:36:35 -05:00
Peter Boyle
e13930c8b2 Faster fermtoprop case 2022-11-30 15:11:29 -05:00
Julian Lenz
0af7d5a793 Rename Grid/qcd/utils/<Group>_impl.h -> Grid/qcd/utils/<Group>.h 2022-11-30 17:12:00 +00:00
Julian Lenz
505fa49983 Renamed SUn.h -> GaugeGroup.h 2022-11-30 17:09:48 +00:00
Julian Lenz
7bcf33def9 Removed Sp2n.h 2022-11-30 16:59:46 +00:00
Julian Lenz
a13820656a Removed iSUnMatrix, etc. 2022-11-30 15:09:03 +00:00
Julian Lenz
fa71b46a41 Hide nsp 2022-11-30 14:44:23 +00:00
Julian Lenz
b8b3ae6ac1 Make helper functions private 2022-11-30 13:29:14 +00:00
Julian Lenz
55c008da21 Removed forward declaration 2022-11-30 13:12:21 +00:00
Julian Lenz
2507606bd0 With function overloading (still dirty). 2022-11-30 12:54:36 +00:00
Julian Lenz
7c2ad4f8c8 Attempt with SFINAE (failed) 2022-11-30 11:57:39 +00:00
Julian Lenz
54c8025aad Remove unnecessary pwd in scripts/filelist 2022-11-28 17:50:38 +00:00
Julian Lenz
921e23e83c Separated out everything SU specific 2022-11-28 17:47:50 +00:00
Julian Lenz
6e750ecb0e Remove apparently forgotten file 2022-11-28 16:33:46 +00:00
Julian Lenz
b8f1f5d2a3 Introduce GaugeGroup 2022-11-25 17:45:32 +00:00
Julian Lenz
9273f2937c Autoformat google style 2022-11-25 17:44:08 +00:00
Julian Lenz
1aa28b47ae Add existing test to check 2022-11-25 17:40:40 +00:00
Julian Lenz
629cb2987a Fix typo in Makefile.am 2022-11-25 17:40:21 +00:00
Julian Lenz
03235d6368 Fixed type in configure.ac 2022-11-25 16:57:40 +00:00
Alessandro Lupo
22064c7e4c Fixing #11 2022-11-25 13:10:29 +00:00
Peter Boyle
5fa573dfd3 partial send fix 2022-11-25 00:51:04 -05:00
Peter Boyle
f6402cb6c4 AUDIT removal 2022-11-25 00:50:33 -05:00
Peter Boyle
bae6c263dc Audit 2022-11-25 00:47:01 -05:00
Peter Boyle
d71672dca9 Bug fix 2022-11-25 00:46:35 -05:00
Peter Boyle
121c9e2ceb Tracing 2022-11-25 00:45:21 -05:00
Peter Boyle
63a30ae34f Tracing 2022-11-25 00:45:05 -05:00
Peter Boyle
7d8231ba32 Tracing 2022-11-25 00:44:57 -05:00
Peter Boyle
b690b1cbe9 Audit 2022-11-25 00:43:57 -05:00
Peter Boyle
c0fb20fc03 Audit check for wrongly locked data 2022-11-25 00:43:12 -05:00
Peter Boyle
bc9579dac6 Old code path removed 2022-11-25 00:40:45 -05:00
Peter Boyle
a5c77f8b95 Tracing moved in order 2022-11-25 00:40:27 -05:00
Alessandro Lupo
2de03e5172 Revert "Revert "Fixing issue #11: consistent use of ncolour and nsp""
This reverts commit 3af4929dda.
2022-11-23 19:40:28 +00:00
Alessandro Lupo
3af4929dda Revert "Fixing issue #11: consistent use of ncolour and nsp"
This reverts commit 1ba429345b.
2022-11-23 19:34:59 +00:00
Alessandro Lupo
1ba429345b Fixing issue #11: consistent use of ncolour and nsp 2022-11-23 18:45:01 +00:00
Peter Boyle
3dbfce5223 Tests clean build on HIP 2022-11-16 20:15:51 -05:00
Peter Boyle
e51eaedc56 Making tests compile 2022-11-15 22:58:30 -05:00
Peter Boyle
e2a938e7f7 GPU happy for compile...? 2022-11-15 17:48:18 -05:00
Peter Boyle
ddad25211b Extra instantiations 2022-11-15 17:47:52 -05:00
Peter Boyle
6209120de9 Fix to GPU compile attempt 2022-11-15 17:25:58 -05:00
Peter Boyle
fe6e8f5ac6 Benchmark_comms fix 2022-11-15 17:00:49 -05:00
Peter Boyle
ee84dcb400 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-11-15 16:41:55 -05:00
Peter Boyle
0ae0e5f436 Partial Dirichlet test 2022-11-15 16:40:38 -05:00
Peter Boyle
e047616571 Multilevel integrator test 2022-11-15 16:39:39 -05:00
Peter Boyle
1af7572c61 Some test HMCs for DDHMC 2022-11-15 16:38:51 -05:00
Peter Boyle
653039695b Partial dirichlet changes 2022-11-15 16:37:15 -05:00
Peter Boyle
ca62abd203 Record some perturbative free field calculation 2022-11-15 16:36:46 -05:00
Peter Boyle
e74666a09c Double length vector type for fast precision change 2022-11-15 16:34:21 -05:00
Peter Boyle
45a001e078 Debug compile 2022-11-15 16:27:20 -05:00
Peter Boyle
0352da34f0 Several deleted files 2022-11-15 16:26:49 -05:00
Peter Boyle
7d302a525d Natural place for this routine is here 2022-11-15 16:24:55 -05:00
Peter Boyle
e2e269e03b Partial dirichlet BCs 2022-11-15 16:24:26 -05:00
Peter Boyle
0db4f1803f Partial dirichlet support 2022-11-15 16:23:41 -05:00
Peter Boyle
5fe480d81c Generic patch 2022-11-15 16:21:45 -05:00
Peter Boyle
0566fc6267 Partial Dirichlet 2022-11-15 16:21:24 -05:00
Peter Boyle
a11c12e2e7 Modifications for partial dirichlet BCs 2022-11-15 16:20:01 -05:00
Peter Boyle
0655dab466 Open MP on host enabled 2022-11-08 13:38:54 -08:00
Peter Boyle
7f097bcc28 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-11-08 13:23:40 -08:00
Peter Boyle
5c75aa5008 Device mem 2022-11-08 13:22:57 -08:00
Peter Boyle
1873101362 PVC 2022-11-08 13:22:45 -08:00
Peter Boyle
63fd1dfa62 Config on PVC 2022-11-08 13:22:09 -08:00
Peter Boyle
bd68861b28 SYCL sum 2022-11-08 12:49:26 -08:00
Peter Boyle
82e959f66c SYCL reduction 2022-11-08 12:45:25 -08:00
Peter Boyle
006268f556 DWF Slow version 2022-11-02 20:24:51 -04:00
Peter Boyle
78acae9b50 Simple DWF for easy check 2022-11-02 20:24:17 -04:00
Peter Boyle
a3927a8a27 Dirichlet 2022-11-02 20:22:27 -04:00
Peter Boyle
d9dd9a5b5f LLVM update 2022-11-02 19:51:50 -04:00
Peter Boyle
eae1c02111 Bounds check 2022-11-02 19:50:32 -04:00
Peter Boyle
132d841b05 Compile fix 2022-11-02 19:33:22 -04:00
Peter Boyle
62e52de06d Merge pull request #414 from fjosw/feat/eCloverGPU
Compact Exponential Cloverterm on GPU
2022-11-01 09:15:44 -04:00
184adeedb8 feat: renamed open_boundaries to fixedBoundaries 2022-10-26 12:53:46 +01:00
5fa6a8b96d docs: CompactClover debug info generalized. 2022-10-26 12:41:14 +01:00
a2a879b668 docs: CompactClover Debug Info improved. 2022-10-25 17:20:42 +01:00
9317d893b2 docs: details about inversion of CompactClover term added. 2022-10-25 17:10:06 +01:00
86075fdd45 feat: MassTerm and ExponentiateClover merged into InstantiateClover 2022-10-25 17:05:34 +01:00
b36442e263 feat: CloverHelpers::InvertClover implemented which handles the
inversion of the Clover term depending on clover type and the boundary
conditions.
2022-10-25 16:57:01 +01:00
513d797ea6 fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed. 2022-10-25 16:17:22 +01:00
9e4835a3e3 feat: changed CompactWilsonExpClover exponentiation to Taylor expansion
with Horner scheme.
2022-10-25 15:19:43 +01:00
Peter Boyle
2e8c3b0ddb Slow implementation of Shamir DWF 2022-10-18 18:10:01 -04:00
Peter Boyle
991667ba5e Revert 2022-10-13 18:50:35 -04:00
Peter Boyle
8a07b52009 Dirichlet 2022-10-13 18:44:47 -04:00
Peter Boyle
2bcff94b52 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-13 18:42:04 -04:00
Peter Boyle
d089739e2f Hack for lattice sites 2022-10-13 17:55:50 -04:00
Peter Boyle
204c283e16 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-11 14:59:07 -04:00
Peter Boyle
551a5f8dc8 RRII gpu option 2022-10-11 14:44:55 -04:00
Peter Boyle
c82b164f6b Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-04 17:41:48 -04:00
Peter Boyle
584a3ee45c Merge pull request #412 from giltirn/patch/adaptive-wflow
Patch/adaptive wflow
2022-10-04 17:23:19 -04:00
Peter Boyle
eec0c9eb7d Merge pull request #411 from giltirn/patch/dirichlet-fixes
Various fixes / changes
2022-10-04 17:22:01 -04:00
Peter Boyle
477ebf24f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-10-04 11:19:43 -07:00
Peter Boyle
0d5639f707 Run script update 2022-10-04 11:13:41 -07:00
Peter Boyle
413312f9a9 Benchmark the halo construction.
THe bye counts are out and should be doubled for SIMD directions
2022-10-04 11:12:59 -07:00
Peter Boyle
03508448f8 Remove verbose 2022-10-04 11:12:15 -07:00
Peter Boyle
e1e5c75023 Stencil gather improvements - SVM was running slow and used for a pointer array that wasn't needed to be in SVM 2022-10-04 11:11:10 -07:00
Peter Boyle
9296299b61 Better commenting 2022-10-04 11:10:34 -07:00
Christopher Kelly
66d001ec9e Refactored Wilson flow class; previously the class implemented both iterative and adaptive smearing, but only the iterative method was accessible through the Smearing base class. The implementation of Smearing also forced a clunky need to pass iterative smearing parameters through the constructor but adaptive smearing parameters through the function call. Now there is a WilsonFlowBase class that implements common functionality, and separate WilsonFlow (iterative) and WilsonFlowAdaptive (adaptive) classes, both of which implement Smearing virtual functions.
Modified the Wilson flow adaptive smearing step size update to implement the original Ramos definition of the distance, where previously it used the norm of a difference which scales with the volume and so would choose too coarse or too fine steps depending on the volume. This is based on Chulwoo's code.

Added a test comparing adaptive (with tuneable tolerance) to iterative Wilson flow smearing on a random gauge configuration.
2022-10-03 10:59:38 -04:00
Peter Boyle
fad2f969d9 Summit up to date 2022-09-27 10:58:43 -04:00
Peter Boyle
48165c1dc1 Ticked off a few items 2022-09-27 10:58:00 -04:00
Peter Boyle
25df2d2c3b Various precision options 2022-09-27 10:57:12 -04:00
Peter Boyle
af9ecb8b41 Current tests compiling 2022-09-27 10:56:55 -04:00
Peter Boyle
234324599e Double2 2022-09-27 10:56:10 -04:00
Peter Boyle
97448a93dc Double2 compiles and dslash runs 2022-09-27 10:55:25 -04:00
Peter Boyle
70c83ec3be More instantiations 2022-09-27 10:54:23 -04:00
Peter Boyle
8f4e2ee545 Double2 2022-09-27 10:53:46 -04:00
Peter Boyle
e8bfbf2f7c D2 operators 2022-09-27 10:37:45 -04:00
Peter Boyle
9e81b42981 D2 fields 2022-09-27 10:37:19 -04:00
Peter Boyle
6c9eef9726 D2 fields 2022-09-27 10:36:54 -04:00
Peter Boyle
7ffbc3e98e Double2 improved. REally don't like 'convertType' - localise to a GPT
header
2022-09-27 10:35:31 -04:00
Peter Boyle
68e4d833dd Run through wrapper script 2022-09-23 16:49:29 -04:00
Peter Boyle
a2cefaa53a Faster 2022-09-23 16:49:14 -04:00
Peter Boyle
a0d682687e Better logging of Fdt for force gradient 2022-09-23 16:22:53 -04:00
Peter Boyle
eb552c3ecd dt info 2022-09-23 16:22:28 -04:00
Peter Boyle
97cce103d7 Tolerances control 2022-09-23 16:21:49 -04:00
Peter Boyle
87ac7104f8 Prettier 2022-09-23 16:20:46 -04:00
Peter Boyle
e4c117aabf Compile fix, multishift mixed prec support 2022-09-23 16:19:27 -04:00
Peter Boyle
5b128a6f9f MixedPrec Multishift with better precision scheme for GPU 2022-09-23 16:18:47 -04:00
Christopher Kelly
19da647e3c Added support for non-periodic gauge field implementations in the random gauge shift performed at the start of the HMC trajectory
(The above required exposing the gauge implementation to the HMC class through the Integrator class)
Made the random shift optional (default on) through a parameter in HMCparameters
Modified ConjugateBC::CshiftLink such that it supports any shift in  -L < shift < L rather than just +-1
Added a tester for the BC-respecting Cshift
Fixed a missing system header include in SSE4 intrinsics wrapper
Fixed sumD_cpu for single-prec types performing an incorrect conversion to a single-prec data type at the end, that fails to compile on some systems
2022-09-09 12:47:09 -04:00
Peter Boyle
1713de35c0 Improved config flags 2022-09-05 21:50:02 -04:00
Peter Boyle
1177b8f661 Merge branch 'develop' into feature/dirichlet 2022-08-31 19:05:57 -04:00
Peter Boyle
442bfb3d42 Merge branch 'develop' into feature/dirichlet 2022-08-31 19:04:19 -04:00
Peter Boyle
e7d9b75fdd Warning fixes 2022-08-31 19:01:14 -04:00
Peter Boyle
3d0e3ec363 Tracing 2022-08-31 18:31:46 -04:00
Peter Boyle
3c1c51f9aa Merge branch 'feature/dirichlet-gparity' into feature/dirichlet 2022-08-31 18:25:34 -04:00
Peter Boyle
8cc3c522c3 Merge pull request #409 from giltirn/feature/dirichlet-gparity-stage
Import round 5
2022-08-31 18:22:50 -04:00
Peter Boyle
913fbca74a Merge pull request #410 from gkanwar/photon_and_sha_patches
Photon.h and SHA256 patches
2022-08-31 18:01:45 -04:00
Peter Boyle
5c87342108 Used in g-2 sign off 2022-08-31 17:35:32 -04:00
Peter Boyle
66177bfbe2 Used in g-2 sign off 2022-08-31 17:35:07 -04:00
Peter Boyle
5205e68963 RocTX, NVTX, text based self profiling 2022-08-31 17:34:09 -04:00
Peter Boyle
cd5cf6d614 Tracing replaces self timing hooks 2022-08-31 17:33:41 -04:00
Peter Boyle
5abb19eab0 Remove self timing 2022-08-31 17:32:49 -04:00
Peter Boyle
06d7b88c78 Force reporting improved 2022-08-31 17:32:21 -04:00
Peter Boyle
cf72799735 Better action naming 2022-08-31 17:24:11 -04:00
Peter Boyle
cdb8fcc269 Width=4 support. This is too broad; hit it on physical point run.
Need to change strategy, I think.
2022-08-31 17:21:33 -04:00
Peter Boyle
b4f4130901 Defer SMP node links until after interior. Allows for DMA overlapping
compute
2022-08-31 17:20:21 -04:00
Peter Boyle
bb049847d5 Tracing replaces self timing 2022-08-31 17:19:02 -04:00
Peter Boyle
fd33c835dd Feynman rule fix and tracing replaces self timing 2022-08-31 17:18:17 -04:00
Peter Boyle
21371a7e5b Tracing replaces self timing 2022-08-31 17:16:05 -04:00
Peter Boyle
abfaa00d3e Tracing replaces self timing 2022-08-31 17:15:24 -04:00
Peter Boyle
efee33c55d Tracing replaces self timing 2022-08-31 17:14:57 -04:00
Peter Boyle
db0fe6ddbb Tracing replaces self timinng 2022-08-31 17:14:14 -04:00
Peter Boyle
8a9e647120 Tracing replaces self timing 2022-08-31 17:13:44 -04:00
Peter Boyle
e6dcb821ad Tracing replaces self timing 2022-08-31 17:12:31 -04:00
Peter Boyle
9bff188f02 Tracing replaces self timing 2022-08-31 17:12:05 -04:00
Peter Boyle
111b30ca1d Tracing replaces self timing 2022-08-31 17:11:48 -04:00
Peter Boyle
24182ca8bf HIP allows conserved currents.
Tracing replaces self timeing
2022-08-31 17:11:18 -04:00
Peter Boyle
ee2d7369b3 Tracing replaces self timing 2022-08-31 17:10:45 -04:00
Peter Boyle
7c686d29c9 Tracing replaces self timing 2022-08-31 17:10:17 -04:00
Peter Boyle
e8a0a1e75d Tracing replaces self timing hooks 2022-08-31 17:09:47 -04:00
Peter Boyle
730be89abf Remove timing hooks as tracing replaces 2022-08-31 17:08:44 -04:00
Peter Boyle
f991ad7d5c Remove timing hooks as tracing replaces 2022-08-31 17:08:18 -04:00
Peter Boyle
b3f33f82f7 Decrease self timing hooks, use nvtx / roctx type tracing hooks instead 2022-08-31 17:06:47 -04:00
Peter Boyle
a34a6e059f Logging improvement. Sinitial will be used to improve RHMC terms 2022-08-31 17:06:08 -04:00
Peter Boyle
1333319941 Tracing 2022-08-31 17:00:25 -04:00
Peter Boyle
9295ed8d20 Print full memory range 2022-08-31 16:59:51 -04:00
Peter Boyle
19cc7653fb Tracing 2022-08-31 16:57:51 -04:00
Peter Boyle
5752538661 Tracing 2022-08-31 16:57:32 -04:00
Peter Boyle
ca40a1b00b Tracing 2022-08-31 16:54:55 -04:00
Peter Boyle
659fac9dfb Tracing hook 2022-08-31 16:54:25 -04:00
Peter Boyle
4dc3d6fce0 Buy into Nvidia/Rocm etc... tracing. 2022-08-31 16:53:19 -04:00
Gurtej Kanwar
60dfb49afa Remove FP16 tests when FP16 is disabled 2022-08-21 17:29:55 +02:00
Gurtej Kanwar
554c238359 Update OpenSSL digest to use high-level methods
This avoids deprecation warnings when compiling against OpenSSL 3.0
but should still be backwards compatible. It is the recommended way
to use the digest API going forward.
2022-08-21 17:28:57 +02:00
Gurtej Kanwar
f922adf05e Fix Photon ComplexField type 2022-08-21 16:16:18 +02:00
Peter Boyle
95b640cb6b 10TF/s on 32^3 x 64 on single node 2022-08-04 15:43:52 -04:00
Peter Boyle
2cb5bedc15 Copy stream HIP improvements 2022-08-04 15:24:03 -04:00
Peter Boyle
806b02bddf Simplify dead code 2022-08-04 15:23:13 -04:00
Peter Boyle
de40395773 More timing. Think I should start to use nvtx and rocmtx ?? 2022-08-04 13:37:16 -04:00
Peter Boyle
7ba4788715 Fix 2022-08-04 13:36:44 -04:00
Peter Boyle
06d9ce1a02 Synch ranks on node here for GPU - GPU memcopy 2022-08-04 13:35:56 -04:00
Peter Boyle
75bb6b2b40 Move barrier into the StencilSend begin routine 2022-08-04 13:35:26 -04:00
Peter Boyle
74f10c2dc0 Move barrier into Stencil Send 2022-08-04 13:34:11 -04:00
Peter Boyle
188d2c7a4d PVC default, ignore ATS 2022-08-02 08:38:53 -07:00
Peter Boyle
17d7177105 Files for SYCL 2022-08-02 08:33:39 -07:00
Peter Boyle
bb0a0da47a inon blocking caution due to SYCL 2022-08-02 08:09:43 -07:00
Peter Boyle
84110166e4 Fix the fence 2022-08-02 08:00:43 -07:00
Peter Boyle
d32b923b6c Fencing on a stream in SYCL is needed. Didn't know that ... gulp 2022-08-02 07:58:04 -07:00
Peter Boyle
a93d5459d4 Better mpi request completion 2022-07-28 12:18:35 -04:00
Peter Boyle
9c21add0c6 High res timer replaces getttimeofday 2022-07-28 12:14:03 -04:00
Peter Boyle
639aab6563 High res timer instead of gettimeofday 2022-07-28 12:13:35 -04:00
Peter Boyle
8137cc7049 Allways concurrent comms 2022-07-28 12:01:51 -04:00
Peter Boyle
60e63dca1d Add memory logging channel 2022-07-28 11:39:15 -04:00
Peter Boyle
486409574e Expanded cach to avoid any allocs in HMC 2022-07-28 11:38:34 -04:00
Peter Boyle
a913b8be12 Dslash self timing. Might want to not have this 2022-07-28 11:37:55 -04:00
Peter Boyle
2239751850 Better logging 2022-07-28 11:37:36 -04:00
Peter Boyle
9b20f1449c Better timing 2022-07-28 11:37:12 -04:00
Peter Boyle
b99453083d Updated timing 2022-07-28 11:37:02 -04:00
Peter Boyle
2ab1af5754 Ensure no synchronize and not optoin dependent 2022-07-19 09:51:06 -07:00
Peter Boyle
5f8892bf03 Mistake pointed out by Camilo 2022-07-19 09:31:51 -07:00
Peter Boyle
f14e7e51e7 Grid accelerator 2022-07-12 10:56:22 -07:00
Peter Boyle
943fbb914d Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-07-11 13:48:42 -04:00
Peter Boyle
ca4603580d Verbose 2022-07-11 13:48:35 -04:00
Peter Boyle
f73db8f1f3 Synch clocks 2022-07-11 13:47:39 -04:00
Peter Boyle
f7217d12d2 World barrier for clock synch 2022-07-11 13:45:31 -04:00
Peter Boyle
fab50c57d9 More loggin 2022-07-11 18:42:27 +01:00
Peter Boyle
3440534fbf MixedPrec support 2022-07-10 21:35:18 +01:00
Peter Boyle
177b1a7ec6 Mixed prec 2022-07-10 21:34:10 +01:00
Peter Boyle
58182fe345 Different approach to default dirichlet params 2022-07-10 21:32:58 +01:00
Peter Boyle
1f907d330d Different default params for dirichlet 2022-07-10 21:31:48 +01:00
Peter Boyle
b0fe664e9d Better force log info 2022-07-10 21:31:25 +01:00
Peter Boyle
c0f8482402 Remove SSC marks 2022-07-07 17:49:36 +01:00
Peter Boyle
3544965f54 Stream doesn't work 2022-07-07 17:49:20 +01:00
Christopher Kelly
33e4a0caee Imported changes from feature/gparity_HMC branch:
Rework of WilsonFlow class
		Fixed logic error in smear method where the step index was initialized to 1 rather than 0, resulting in the logged output value of tau being too large by epsilon
		Previously smear_adaptive would maintain the current value of tau as a class member variable whereas smear would compute it separately; now both methods maintain the current value internally and it is updated by the evolve_step routines. Both evolve methods are now const.
		smear_adaptive now also maintains the current value of epsilon internally, allowing it to be a const method and also allowing the same class instance to be reused without needing to be reset
		Replaced the fixed evaluation of the plaquette energy density and plaquette topological charge during the smearing with a highly flexible general strategy where the user can add arbitrary measurements as functional objects that are evaluated at an arbitrary frequency
	        By default the same plaquette-based measurements are performed, but additional example functions are provided where the smearing is performed with different choices of measurement that are returned as an array for further processing
		Added a method to compute the energy density using the Cloverleaf approach which has smaller discretization errors
	Added a new tensor utility operation, copyLane, which allows for the copying of a single SIMD lane between two instances of the same tensor type but potentially different precisions
	To LocalCoherenceLanczos, added the option to compute the high/low eval of the fine operator on every restart to aid in tuning the Chebyshev
	Added Test_field_array_io which demonstrates and tests a single-file write of an arbitrary array of fields
	Added Test_evec_compression which generates evecs using Lanczos and attempts to compress them using the local coherence technique
	Added Test_compressed_lanczos_gparity which demonstrates the local coherence Lanczos for G-parity BCs
	Added HMC main programs for the 40ID and 48ID G-parity lattices
2022-07-01 14:12:12 -04:00
Peter Boyle
1f903d9296 Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-07-01 12:12:50 -04:00
Peter Boyle
4df1e0987f Merge branch 'feature/dirichlet-gparity' of https://github.com/paboyle/Grid into feature/dirichlet-gparity 2022-07-01 09:55:43 -04:00
Peter Boyle
588c2f3cb1 Faster axpy_norm and innerProduct 2022-07-01 09:44:58 -04:00
Peter Boyle
bd99fd608c Introduce a non-default stream for compute operatoins 2022-07-01 09:42:53 -04:00
Peter Boyle
57b442d0de Log memory operations 2022-07-01 09:42:17 -04:00
Peter Boyle
751a4562d7 Timing improvement 2022-07-01 09:41:43 -04:00
Peter Boyle
ca66301dee Remove debug 2022-06-30 14:53:12 -04:00
Peter Boyle
808bb59206 Mixed prec DD-RHMC 2022-06-30 13:50:09 -04:00
Peter Boyle
4b7f51d19d Create a new RNG file 2022-06-30 13:49:50 -04:00
Peter Boyle
d03152fac4 New file under debug 2022-06-30 13:49:35 -04:00
Peter Boyle
137f190258 Dirichlet implementation 2022-06-30 13:45:07 -04:00
Peter Boyle
53d01312b3 Rough flop counting, need to add M5D, M5Ddag, MooeeInv flops 2022-06-30 13:44:09 -04:00
Peter Boyle
220050822a Speed up M5D and M5Ddag 2022-06-30 13:43:27 -04:00
Peter Boyle
87ad76d81b Initialise timeval 2022-06-30 13:42:46 -04:00
Peter Boyle
042ab1a052 Update GridStd.h 2022-06-27 13:21:39 -04:00
Peter Boyle
4ac1094856 Updated config commands 2022-06-27 12:16:24 -04:00
Peter Boyle
d44a57b0af Allow frequency=0 to disable 2022-06-27 12:15:55 -04:00
Peter Boyle
dc000d10ee Spelling correction 2022-06-27 12:14:57 -04:00
Peter Boyle
3685f391cf More verbose CG 2022-06-27 12:11:08 -04:00
Peter Boyle
efd7338a00 Allow dirichlet at round the world link 2022-06-27 12:10:27 -04:00
Peter Boyle
e1e7b1e224 RNG fix 2022-06-27 12:09:52 -04:00
Peter Boyle
7319d4e1ad Merge pull request #407 from giltirn/feature/dirichlet-gparity-stage
Import round 4
2022-06-22 15:23:36 -04:00
Christopher Kelly
fd933420c6 Imported changes from feature/gparity_HMC branch:
Added a bounds-check function for the RHMC with arbitrary power
	Added a pseudofermion action for the rational ratio with an arbitrary power and a mixed-precision variant of the same. The existing one-flavor rational ratio class now uses the general class under the hood
	To support testing of the two-flavor even-odd ratio pseudofermion, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field
	Added a new HMC runner start type: CheckpointStartReseed, which reseeds the RNG from scratch, allowing for the creation of new evolution streams from an existing checkpoint. Added log output of seeds used when the RNG is seeded.
	EOFA changes:
		To support mixed-precision inversion, generalized the class to maintain a separate solver for the L and R operators in the heatbath (separate solvers are already implemented for the other stages)
		To support mixed-precision, the action of setting the operator shift coefficients is now maintained in a virtual function. A derived class for mixed-precision solvers ensures the coefficients are applied to both the double and single-prec operators
		The ||^2 of the random source is now stored by the heatbath and compared to the initial action when it is computed. These should be equal but may differ if the rational bounds are not chosen correctly, hence serving as a useful and free test
		Fixed calculation of M_eofa (previously incomplete and #if'd out)
		Added functionality to compute M_eofa^-1 to complement the calculation of M_eofa (both are equally expensive!)
		To support testing, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field
	Added a test program which computes the G-parity force using the 1 and 2 flavor implementations and compares the result. Test supports DWF, EOFA and DSDR actions, chosen by a command line option.
	The Mobius EOFA force test now also checks the rational approximation used for the heatbath
	Added a test program for the mixed precision EOFA compared to the double-prec implementation,
	G-parity HMC test now applied GPBC in the y direction and not the t direction (GPBC in t are no longer supported) and checkpoints after every configuration
	Added a test program which computes the two-flavor G-parity action (via RHMC) with both the 1 and 2 flavor implementations and checks they agree
	Added a test program to check the implementation of M_eofa^{-1}
2022-06-22 10:27:48 -04:00
Peter Boyle
8208a6214f Merge branch 'feature/dirichlet-gparity' into feature/dirichlet 2022-06-15 19:23:48 -04:00
Peter Boyle
3d8146b596 Merge branch 'feature/dirichlet-gparity' of https://github.com/paboyle/Grid into feature/dirichlet-gparity 2022-06-15 19:20:27 -04:00
Peter Boyle
31efa5c4da Script updates for current summit 2022-06-15 19:19:44 -04:00
Peter Boyle
d10d30dda8 Script update 2022-06-15 19:18:58 -04:00
Peter Boyle
0e9666bc92 Test update 2022-06-15 19:18:42 -04:00
Peter Boyle
6efd80f104 Printing 2022-06-15 18:23:46 -04:00
Peter Boyle
fdef7a1a8c Dirichlet fix 2022-06-15 00:05:20 -04:00
Peter Boyle
501bb117bf Const correct 2022-06-15 00:04:09 -04:00
Peter Boyle
05ca7dc252 Const correctness 2022-06-14 23:41:05 -04:00
Peter Boyle
e9648a1635 Useful periodic print. CG convergence bound is remarkably accurate on
low eigenvalue in numerical tests
2022-06-14 23:40:04 -04:00
Peter Boyle
2df98a99bc Merge pull request #406 from giordano/patch-1
Update default value of gen-simd-width in README
2022-06-14 17:46:25 -04:00
Mosè Giordano
315ea18be2 Update default value of gen-simd-width in README 2022-06-14 22:41:05 +01:00
Peter Boyle
9a9f4a111f Merge pull request #405 from giltirn/feature/dirichlet-gparity-stage
Import round 3
2022-06-06 18:45:37 -04:00
Christopher Kelly
1ad54d049d To PeriodicBC and ConjugateBC, added a new function "CshiftLink" which performs a boundary-aware C-shift of links or products of links. For the latter, the links crossing the global boundary are complex-conjugated.
To the gauge implementations, added CshiftLink functions calling into the appropriate operation for the BC in a given direction.
GaugeTransform, FourierAcceleratedGaugeFixer and WilsonLoops::FieldStrength no longer implicitly assume periodic boundary conditions; instead the shifted link is obtained using CshiftLink and is aware of the gauge implementation.
Added an assert-check to ensure that the gauge fixing converges within the specified number of steps.
Added functionality to compute the timeslice averaged plaquette
Added functionality to compute the 5LI topological charge and timeslice topological charge
Added a check of the properties of the charge conjugation matrix C=-gamma_2 gamma_4 to Test_gamma
Fixed const correctness for Replicate
Modified Test_fft_gfix to support either conjugate or periodic BCs, optionally disabling Fourier-accelerated gauge fixing, and tuning of alpha using cmdline options
2022-06-02 15:30:41 -04:00
Peter Boyle
57bd0a0a22 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-06-01 19:29:38 -04:00
Peter Boyle
b49db84b08 Slurm updates 2022-06-01 19:27:42 -04:00
Peter Boyle
583f7c52f3 SSC mark 2022-06-01 19:27:29 -04:00
Peter Boyle
58a86c9164 SSC mark removal 2022-06-01 19:27:06 -04:00
Peter Boyle
a25b32847f Crusher patch 2022-06-01 19:26:37 -04:00
Peter Boyle
6f1a2e132b SSC mark causing problems 2022-06-01 19:26:06 -04:00
Peter Boyle
b1ede7b46d Faster RNG init 2022-06-01 19:25:42 -04:00
Peter Boyle
e762c940c2 Reduce the loop over exterior for GPU to indirection table 2022-06-01 14:29:25 -07:00
Peter Boyle
6a1a198144 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-29 11:08:09 -04:00
Peter Boyle
34faa39f4f Clean up Dirichlet. Big oops fix 2022-05-28 17:18:08 -07:00
Peter Boyle
5ddea3829d Extra easier signature for peek 2022-05-28 15:52:39 -07:00
Peter Boyle
7eb29cf529 MPI fix 2022-05-28 15:51:34 -07:00
Peter Boyle
f729b9b889 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-25 14:16:09 -04:00
Peter Boyle
4f997c5f04 Remove extra face kernels in Dirichlet 2022-05-25 11:15:25 -07:00
Peter Boyle
a9c2e1df03 Merge pull request #404 from rrhodgson/feature/json_nvcc
Feature/json nvcc
2022-05-25 13:30:11 -04:00
Peter Boyle
d3496d2fe0 Merge pull request #397 from giltirn/feature/dirichlet-gparity-stage
Gparity HMC import round 2
2022-05-25 13:29:45 -04:00
Peter Boyle
60f4cb0ffd Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-25 12:38:10 -04:00
Peter Boyle
136d843ce7 Crusher updates 2022-05-25 12:36:09 -04:00
Peter Boyle
18028f4309 Merge branch 'develop' into feature/dirichlet 2022-05-24 18:26:18 -07:00
Peter Boyle
5164016740 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-05-24 18:25:57 -07:00
Peter Boyle
d83beaa890 Update perlmutter 2022-05-24 18:25:00 -07:00
Peter Boyle
f9f05e995b Update perlmutter 2022-05-24 18:24:38 -07:00
Peter Boyle
e651b9e7ab Clean up stencil with better intranode Dirichlet / DDHMC support.
14TF/s on a Perlmutter node
2022-05-24 18:23:39 -07:00
Peter Boyle
47b4e91473 Verbose change 2022-05-24 18:19:18 -07:00
Peter Boyle
3f31afa4fc Clean up verbose 2022-05-24 18:18:51 -07:00
da4daea57a Updated json to latest release 3.10.5 2022-05-24 16:16:06 +01:00
Peter Boyle
af3b065add Merge pull request #403 from fjosw/fix/cuda_11_5_warnings
Fixed nvcc 11.5+ warnings
2022-05-24 11:10:02 -04:00
e346154c5d Updated json CUDA compile guards 2022-05-24 15:48:01 +01:00
7937ac2bab fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in pugixml/pugixml.cc 2022-05-24 15:31:03 +01:00
e909aeedf0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in Grid_Eigen_Dense.h 2022-05-24 15:29:42 +01:00
bab8aa8eb0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT
standard in DisableWarnings.h
2022-05-24 15:27:40 +01:00
Peter Boyle
38b22f05be Merge pull request #402 from fjosw/fix/clover_warnings
fixed clover warnings
2022-05-24 10:05:27 -04:00
3ca0de1c40 Fix json write for vector<string> 2022-05-24 14:37:33 +01:00
c7205d2a73 Removed nvcc guards for json 2022-05-24 14:30:26 +01:00
617c5362c1 fix: fixed warning: missing return statement at end of non-void function
in CloverHelpers
2022-05-24 11:37:33 +01:00
Peter Boyle
083b58e66d Merge pull request #401 from JPRichings/LocalCoheranceDeflation
Local coherance batch deflation
2022-05-20 11:44:22 -04:00
Peter Boyle
633427a2df Merge pull request #400 from JPRichings/wilson_sweep
bench wilson sweep fix
2022-05-20 11:43:40 -04:00
2031d6910a Merge branch 'paboyle:develop' into wilson_sweep 2022-05-20 16:20:23 +01:00
Peter Boyle
f82ce67624 Dirichlet improved 2022-05-19 19:17:11 -07:00
Peter Boyle
b52e8ef65a Dirichlet changes 2022-05-19 16:45:41 -07:00
Peter Boyle
2594e3c230 Dirichlet option 2022-05-19 16:45:19 -07:00
Peter Boyle
8cedb45af2 Dirichlet BCs 2022-05-19 16:45:02 -07:00
Peter Boyle
aa008cbe99 Updated for new Dirichlet interface 2022-05-19 16:44:39 -07:00
79e34b3eb4 Local Coherence batch deflation 2022-05-19 14:53:17 +01:00
4f3d581ab4 Merge branch 'paboyle:develop' into LocalCoheranceDeflation 2022-05-19 14:46:17 +01:00
Peter Boyle
6fb6ca5b6b Merge branch 'develop' into feature/dirichlet 2022-05-17 09:09:00 -07:00
Peter Boyle
b8ee19691c Updated config for PM 2022-05-17 09:08:12 -07:00
Peter Boyle
d16427b837 Merge pull request #399 from fjosw/fix/Nc_neq_3
fix: assert for dimensions of compact Wilson clover moved to constructor
2022-05-17 09:03:42 -04:00
4b1997e2f3 wilson sweep test 2022-05-16 15:58:33 +01:00
8939d5dc73 bugfix: eo operator called in correct location 2022-05-16 00:28:28 +01:00
b051e00de0 Additional Local Coherance Deflation operator() 2022-05-16 00:25:13 +01:00
8aa75b492f Merge branch 'develop' into fix/Nc_neq_3 2022-05-10 14:22:03 +01:00
Peter Boyle
0274f40686 Merge pull request #389 from mbruno46/mbruno-eclover
Feature/expClover
2022-05-10 09:18:19 -04:00
Peter Boyle
77aa147ce5 Merge branch 'develop' into mbruno-eclover 2022-05-10 09:16:53 -04:00
32facbd02a fix: assert for dimensions of compact Wilson clover moved to
constructor.
2022-05-10 10:53:22 +01:00
Christopher Kelly
6121397587 Imported changes from feature/gparity_HMC branch:
Added storage of final true residual in mixed-prec CG and enhanced log output
	Fixed const correctness of multi-shift constructor
	Added a mixed precision variant of the multi-shift algorithm that uses a single precision operator and applies periodic reliable update to the residual
	Added tests/solver/Test_dwf_multishift_mixedprec to test the above
	Fixed local coherence lanczos using the (large!) max approx to the chebyshev eval as the scale from which to judge the quality of convergence, resulting a test that always passes
	Added a method to local coherence lanczos class that returns the fine eval/evec pair
	Added iterative log output to power method
	Added optional disabling of the plaquette check in Nerscio to support loading old G-parity configs which have a factor of 2 error in the plaquette
	G-parity Dirac op no longer allows GPBC in the time direction; instead we toggle between periodic and antiperiodic
	Replaced thread_for G-parity 5D force insertion implementation with accelerator_for version capable of running on GPUs
	Generalized tests/lanczos/Test_dwf_lanczos to support regular DWF as well as Gparity, with the action chosen by a command line option
	Modified tests/forces/Test_dwf_gpforce,Test_gpdwf_force,Test_gpwilson_force to use GPBC a spatial direction rather than the t-direction, and antiperiodic BCs for time direction
	tests/core/Test_gparity now supports using APBC in time direction using command line toggle
2022-05-09 16:27:57 -04:00
Peter Boyle
4de50ab146 Merge pull request #396 from fjosw/fix/readd_config.h
fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am
2022-05-09 08:26:48 -04:00
8b12a61097 fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am 2022-05-09 11:53:22 +01:00
Peter Boyle
79ea027c0b Merge pull request #377 from RJHudspith/develop
NERSC and ILDG for non-SU(3) configuration checkpoints
2022-05-03 08:55:48 -04:00
Peter Boyle
62339d437f Merge pull request #387 from lehner/feature/gpt
Parity mass terms for domain wall fermions to enable 4d eofa
2022-05-03 08:52:18 -04:00
Peter Boyle
698e745276 Merge pull request #390 from fjosw/feature/conserved_current_wilson
Conserved current for wilson fermions
2022-05-03 08:51:10 -04:00
Peter Boyle
0417b96896 Merge pull request #391 from giltirn/feature/dirichlet-gparity-stage
First stage of import
2022-05-03 08:50:18 -04:00
Peter Boyle
9a6e2c315d Merge pull request #394 from fjosw/fix/gauge_fix_ErrorOnNoConverge
SteepestDescentGaugeFix now exits when the algorithm does not converge.
2022-05-03 08:49:26 -04:00
e61fed87db SteepestDescentGaugeFix now exits when the algorithm does not converge.
This behaviour can be altered by setting err_on_no_converge to false.
2022-04-20 15:41:55 +01:00
Christopher Kelly
81fe4c937e Hopefully fix link errors on Intel compilers due to having no function body for MomentumFilterBase::apply_phase 2022-04-12 09:51:59 -04:00
Christopher Kelly
f77f3a6598 Imported G-parity flavor algebra + tester from feature/gparity_HMC branch 2022-04-06 10:21:04 -04:00
Peter Boyle
239afb18fb Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-04-05 16:49:32 -04:00
Peter Boyle
ef820a26cd Bcopy on crusher compile 2022-04-05 16:49:02 -04:00
Peter Boyle
65abe4d0d3 Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-04-05 16:26:54 -04:00
Peter Boyle
5012adfebf Merge branch 'develop' into feature/dirichlet 2022-04-05 16:26:19 -04:00
Peter Boyle
b808d48fa1 Tone down printing in integrator 2022-04-05 16:25:22 -04:00
Peter Boyle
83f818a99d Updates for DDHMC 2022-04-05 16:24:34 -04:00
b8bc560b51 Test_wilson_conserved_current implemented, all 5d references removed. 2022-04-05 17:33:45 +01:00
6bc2483d57 Merge branch 'feature/eclover' into feature/conserved_current_wilson 2022-04-05 15:26:49 +01:00
82aecbf4cf Test_wilson_conserved_current added 2022-04-05 15:26:39 +01:00
Mattia Bruno
ee23a76aa0 Merge pull request #2 from fjosw/feature/eclover
Feature/eclover
2022-04-05 13:30:13 +02:00
d7191e5a02 SeqConservedCurrent implemented for Wilson fermions 2022-04-05 11:48:56 +01:00
c8a824425b Error message added if another conserved current than vector is requested for
Wilson type fermions.
2022-04-05 10:58:22 +01:00
f23626a6b8 End scope by additional block in CloverHelpers.h 2022-04-02 16:08:15 +01:00
6577a03d16 Explcitly closed views in Exponentiate_Clover 2022-04-01 18:39:12 +01:00
427c8695fe Change signs and prefactors for conserved current to mimic the 5d
version.
2022-04-01 16:20:21 +01:00
9e82c468ab Multiplication of diagonal mass in exponentiate fixed for gpus 2022-04-01 15:54:43 +01:00
603fd96747 Missing link multiplication added. 2022-04-01 10:58:56 +01:00
fe993c0836 /=2 replaced by *=0.5 2022-03-31 17:08:17 +01:00
cdf31d52c1 GaugeGrid and typo fixed 2022-03-31 17:04:35 +01:00
0542eaf1da First version of conserved current contraction for Wilson type quarks 2022-03-31 17:02:09 +01:00
Christoph Lehner
317bdcf158 nerscio parametrization 2022-03-24 13:10:47 +01:00
Peter Boyle
387397374a Current run options 2022-03-23 16:35:11 -04:00
Mattia Bruno
9ca2c98882 Merge branch 'develop' of https://github.com/paboyle/Grid into mbruno-eclover 2022-03-22 15:31:37 +01:00
Peter Boyle
605cf401e1 Merge branch 'feature/sumd-npr' into develop 2022-03-16 22:43:12 +00:00
Peter Boyle
f99c3660d2 Merge branch 'feature/cpu-threaded-smp' into develop 2022-03-16 22:07:54 +00:00
Peter Boyle
92a83a9eb3 Performance improve for Tesseract 2022-03-16 17:14:36 +00:00
Mattia Bruno
53ae01a34a Merge pull request #1 from fjosw/feature/eclover
Feature/eclover
2022-03-15 15:23:35 +01:00
Peter Boyle
b615fa0f35 Merge pull request #388 from fjosw/feature/sumd-npr
Feature/sumd npr
2022-03-15 09:05:57 -04:00
Christoph Lehner
76c294a7ba open bc fix 2022-03-08 13:55:16 +01:00
0c0c2b1e20 Unnecessary arguments of CloverHelpers::Exponentiate_Clover removed. 2022-03-08 09:44:51 +00:00
Christoph Lehner
e2fc3a0f04 Merge pull request #28 from paboyle/develop
Sync with Upstream
2022-03-08 09:58:51 +01:00
451e7972fd Reintroduced explicit inversion of the Clover term in case of the
CompactExpClover because of the open boundary O(a) improvement. Changed
the timing output to GridLogDebug
2022-03-07 17:43:33 +00:00
56c089d347 Removed leftover comments 2022-03-07 16:40:20 +00:00
acf740e44d Merge pull request #1 from FelixPGZiegler/feature/eclover
Feature/eclover
2022-03-07 16:25:11 +00:00
182f513404 Merge remote-tracking branch 'fjosw/feature/eclover' into feature/eclover 2022-03-07 15:22:04 +00:00
d5b2323a57 included Cayley-Hamilton exponentiation for the compact Wilson exp clover, bug fix for inverse of exp clover 2022-03-07 14:44:24 +00:00
FelixPGZiegler
bad18d4417 Merge branch 'paboyle:develop' into feature/eclover 2022-03-07 13:54:10 +00:00
Peter Boyle
bb5c16b97f New scripts 2022-03-03 17:00:37 -05:00
Peter Boyle
0d80eeb545 small DDHMC update 2022-03-03 16:56:02 -05:00
d1decee4cc Cleaned up unused variables in Lattice_reduction_gpu.h 2022-03-02 16:54:23 +00:00
d4ae71b880 sum_gpu_large and sum_gpu templates added. 2022-03-02 15:40:18 +00:00
Peter Boyle
b0f4eee78b New files 2022-03-01 19:09:13 -05:00
Peter Boyle
5340e50427 HMC running with new formulation 2022-03-01 17:10:25 -05:00
Peter Boyle
e16fc5b2e4 Threaded intranode comms transfer - ideally between NUMA domains 2022-03-01 11:17:24 -05:00
Peter Boyle
694306f202 Configure for mac arm 2022-03-01 10:53:44 -05:00
Peter Boyle
9aac1e6d64 Merge branch 'develop' into feature/sumd-npr 2022-03-01 10:51:38 -05:00
Peter Boyle
3e882f555d Large / small sumD options 2022-03-01 08:54:45 -05:00
438caab25f generate_instantiations.sh now correctly produces instantiations for CompactClover variant, redundant instantiations removed. 2022-02-27 18:27:18 +00:00
239e2c1ee6 tests: wilson clover cg tests now include compact variant as well as
exponential wilson clover operators
2022-02-27 18:26:34 +00:00
013dc2ef33 tests: core tests for wilson clover and wilson exp clover including
compact version extended/added
2022-02-27 18:13:47 +00:00
Christoph Lehner
9616811c3d Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2022-02-24 22:03:05 +01:00
Christoph Lehner
8a3002c03b separate left and right masses for CayleyFermion5D 2022-02-24 22:02:56 +01:00
Peter Boyle
0f1c5b08a1 Dirichlet filters running on AMD and now integrated in Fermion op 2022-02-23 19:29:28 -05:00
Peter Boyle
70988e43d2 Passes multinode dirichlet test with boundaries at
node boundary or at the single rank boundary
2022-02-23 01:42:14 -05:00
Mattia Bruno
71034f828e attempt to fix broken WilsonExpClover; Compact version still broken will be replaced by F.Joswig 2022-02-23 01:02:27 +01:00
Peter Boyle
aab3bcb46f Dirichlet first cut - wrong answers on dagger multiply.
Struggling to get a compute node so changing systems
2022-02-22 19:58:33 +00:00
Mattia Bruno
11437930c5 cleaned up definitions of wilsonclover fermions 2022-02-22 10:45:16 +01:00
Mattia Bruno
3d44aa9cb9 cleaned up cloverhelpers; fixed test compact_clover which runs 2022-02-22 01:10:19 +01:00
Mattia Bruno
2851870d70 expClover support via helpers template class 2022-02-22 00:05:43 +01:00
Peter Boyle
da06d15f73 Merge branch 'feature/feature/staggered-comms' into develop 2022-02-17 04:58:50 +00:00
Peter Boyle
e8b1251b8c Staggered fix finished 2022-02-17 04:51:13 +00:00
Peter Boyle
fad5a74a4b Bug fix to detection case 2022-02-15 10:27:39 -05:00
Peter Boyle
e83f6a6ae9 Merge branch 'develop' into feature/feature/staggered-comms 2022-02-15 08:52:39 -05:00
Azusa Yamaguchi
6283d11d50 Add the comment line to tell the existance of copied data/buffer 2022-02-08 15:22:06 +00:00
Peter Boyle
6616d5d090 Commit 2022-02-02 16:38:24 -05:00
RJHudspith
0bd83cdbda Fixes for Nc!=3 Nersc IO, Gauge and Gauge_NCxNC compatible with GLU. Trace normalisation changed in places removing explicit threes. Guards against non-su3 tests and tests failing when LIME is not compiled. 2021-11-28 21:51:03 +01:00
Alessandro Lupo
88bdd4344b 2indx antisymm representation of sp2n 2021-11-04 18:27:35 +00:00
Peter Boyle
42d56ea6b6 Verbosity 2021-10-29 02:23:08 +01:00
Peter Boyle
0b905a72dd Better reduction for GPUs 2021-10-29 02:22:22 +01:00
Alessandro Lupo
4044536eea add projection on sp2n algebra 2021-10-26 10:20:44 +01:00
Alessandro Lupo
4d8ae6221c fix projection 2021-10-22 10:44:54 +01:00
Alessandro Lupo
4e31e4e094 Better tests 2021-10-13 15:07:23 +01:00
Alessandro Lupo
0d6674e489 hot start for sp2n 2021-10-12 18:53:54 +01:00
Alessandro Lupo
b145fd4f5b necessary to merge 2021-10-12 17:08:46 +01:00
Alessandro Lupo
8a5b794f25 necessary change to merge with upstrm 2021-10-12 16:04:03 +01:00
Alessandro Lupo
291e80f88a sp2n as config option 2021-10-12 16:00:32 +01:00
Alessandro Lupo
1ace5850ae first hmc 2021-10-12 16:00:32 +01:00
Alessandro Lupo
283f14b7c1 fix sp2n projection 2021-10-12 16:00:32 +01:00
Alessandro Lupo
1d6e708083 tests! 2021-10-12 16:00:32 +01:00
Alessandro Lupo
89457e25e3 sp fermion instantiation 2021-10-12 16:00:32 +01:00
Alessandro Lupo
7e3b298d3d project on sp2n 2021-10-12 16:00:32 +01:00
Alessandro Lupo
7ff3e5eed4 gauge and fermion implementation for sp2n 2021-10-12 16:00:32 +01:00
Alessandro Lupo
19eb51cf41 sp2n generators 2021-10-12 15:53:33 +01:00
Alessandro Lupo
470d4dcc6d sp2n as config option 2021-10-12 15:47:56 +01:00
Alessandro Lupo
ed03bfd555 first hmc 2021-10-12 12:16:47 +01:00
Alessandro Lupo
8c0fbcccae fix sp2n projection 2021-10-12 12:12:16 +01:00
Alessandro Lupo
d4866157fe tests! 2021-10-12 09:06:15 +01:00
Alessandro Lupo
b6496b6cb5 sp fermion instantiation 2021-10-11 16:32:10 +01:00
Alessandro Lupo
4f5fe57920 project on sp2n 2021-10-11 16:28:15 +01:00
Alessandro Lupo
11fb943b1e gauge and fermion implementation for sp2n 2021-10-11 16:21:25 +01:00
Alessandro Lupo
046a23121e sp2n generators 2021-10-05 15:51:22 +01:00
702 changed files with 72093 additions and 22602 deletions

54
.github/ISSUE_TEMPLATE/bug-report.yml vendored Normal file
View File

@@ -0,0 +1,54 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]
body:
- type: markdown
attributes:
value: >
Thank you for taking the time to file a bug report.
Please check that the code is pointing to the HEAD of develop
or any commit in master which is tagged with a version number.
- type: textarea
attributes:
label: "Describe the issue:"
description: >
Describe the issue and any previous attempt to solve it.
validations:
required: true
- type: textarea
attributes:
label: "Code example:"
description: >
If relevant, show how to reproduce the issue using a minimal working
example.
placeholder: |
<< your code here >>
render: shell
validations:
required: false
- type: textarea
attributes:
label: "Target platform:"
description: >
Give a description of the target platform (CPU, network, compiler).
Please give the full CPU part description, using for example
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
or `sysctl machdep.cpu.brand_string` (macOS) and the full output
the `--version` option of your compiler.
validations:
required: true
- type: textarea
attributes:
label: "Configure options:"
description: >
Please give the exact configure command used and attach
`config.log`, `grid.config.summary` and the output of `make V=1`.
render: shell
validations:
required: true

4
.gitignore vendored
View File

@@ -1,3 +1,7 @@
# Doxygen stuff
html/*
latex/*
# Compiled Object files #
#########################
*.slo

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -0,0 +1,5 @@
CXX=hipcc
MPICXX=mpicxx
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench

View File

@@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -34,9 +34,6 @@ directory
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
#if defined __GNUC__
#pragma GCC diagnostic ignored "-Wpsabi"
#endif
//disables and intel compiler specific warning (in json.hpp)
@@ -47,14 +44,22 @@ directory
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC

View File

@@ -44,9 +44,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
//#include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h>
#include <Grid/perfmon/Tracing.h>
#include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/threads/ThreadReduction.h>
@@ -58,6 +59,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice.h>
#include <Grid/cshift/Cshift.h>
#include <Grid/stencil/Stencil.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore)

View File

@@ -16,6 +16,7 @@
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>

View File

@@ -14,7 +14,11 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
#endif
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
@@ -30,7 +34,7 @@
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif

View File

@@ -66,6 +66,10 @@ if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a

View File

@@ -30,9 +30,14 @@ directory
#include <type_traits>
#include <cassert>
#include <exception>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }

View File

@@ -29,6 +29,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H
NAMESPACE_CHECK(blas);
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h>
@@ -44,7 +47,11 @@ NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/deflation/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h>
@@ -55,6 +62,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@@ -65,11 +73,13 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/SimpleLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/multigrid/MultiGrid.h>
NAMESPACE_CHECK(multigrid);
#include <Grid/algorithms/FFT.h>
#endif

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#ifdef USE_MKL
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>
#else
#include <fftw3.h>
@@ -168,6 +168,7 @@ public:
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
assert(0);
#else
conformable(result.Grid(),vgrid);
@@ -190,6 +191,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -213,6 +215,7 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -226,6 +229,7 @@ public:
}
// Barrel shift and collect global pencil
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@@ -247,6 +251,7 @@ public:
}
}
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@@ -269,6 +274,7 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@@ -285,6 +291,7 @@ public:
}
result = result*div;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif

View File

@@ -103,6 +103,38 @@ public:
_Mat.MdagM(in,out);
}
};
template<class Matrix,class Field>
class MMdagLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MMdag(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.MMdag(in,out);
}
};
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother
@@ -145,6 +177,44 @@ public:
}
};
////////////////////////////////////////////////////////////////////
// Create a shifted HermOp
////////////////////////////////////////////////////////////////////
template<class Field>
class ShiftedHermOpLinearOperator : public LinearOperatorBase<Field> {
LinearOperatorBase<Field> &_Mat;
RealD _shift;
public:
ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
void Op (const Field &in, Field &out){
HermOp(in,out);
}
void AdjOp (const Field &in, Field &out){
HermOp(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.HermOp(in,out);
out = out + _shift*in;
}
};
////////////////////////////////////////////////////////////////////
// Wrap an already herm matrix
////////////////////////////////////////////////////////////////////
@@ -207,6 +277,38 @@ public:
assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
@@ -526,6 +628,7 @@ public:
(*this)(Linop,in[k],out[k]);
}
};
virtual ~OperatorFunction(){};
};
template<class Field> class LinearFunction {
@@ -541,6 +644,7 @@ public:
(*this)(in[i], out[i]);
}
}
virtual ~LinearFunction(){};
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {

View File

@@ -45,6 +45,11 @@ public:
M(in,tmp);
Mdag(tmp,out);
}
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;

View File

@@ -59,7 +59,7 @@ public:
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
delta*=1.02;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
@@ -90,9 +90,8 @@ public:
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
Coeffs.resize(order,0.0);
Coeffs[order-1] = 1.0;
};
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
@@ -132,6 +131,26 @@ public:
Coeffs[j] = s * 2.0/order;
}
};
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
@@ -258,26 +277,12 @@ public:
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
#if 0
auto y_v = y.View();
auto Tn_v = Tn->View();
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#else
axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#endif
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
@@ -292,7 +297,6 @@ public:
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;

View File

@@ -40,7 +40,7 @@ public:
RealD norm;
RealD lo,hi;
MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;};
RealD approx(RealD x);
void csv(std::ostream &out);
void gnuplot(std::ostream &out);

View File

@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
* Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
* type = 1 for the approximation which is infinite at x = 0. */
zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
l, invlambda, xi, xisq, *tv, s, opl;
int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to PRECISION for external use only */
/* Converting everything to ZOLO_PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
}
zolotarev_data* higham(PRECISION epsilon, int n) {
zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
int m, czero;
zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
#ifdef TEST
#undef ZERO
#define ZERO ((PRECISION) 0)
#define ZERO ((ZOLO_PRECISION) 0)
#undef ONE
#define ONE ((PRECISION) 1)
#define ONE ((ZOLO_PRECISION) 1)
#undef TWO
#define TWO ((PRECISION) 2)
#define TWO ((ZOLO_PRECISION) 2)
/* Evaluate the rational approximation R(x) using the factored form */
static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R;
ZOLO_PRECISION R;
if (rdata -> type == 0) {
R = rdata -> A * x;
@@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
/* Evaluate the rational approximation R(x) using the partial fraction form */
static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> alpha[rdata -> da - 1];
ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
for (m = 0; m < rdata -> dd; m++)
R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
* non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
* but with signalling overflow you will get an error message. */
static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> beta[0] * x;
ZOLO_PRECISION R = rdata -> beta[0] * x;
for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
return R;
}
/* Evaluate the rational approximation R(x) using Cayley form */
static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION T;
ZOLO_PRECISION T;
T = rdata -> type == 0 ? ONE : -ONE;
for (m = 0; m < rdata -> n; m++)
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
int m, n, plotpts = 5000, type = 0;
float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
zolotarev_data *rdata;
PRECISION y;
ZOLO_PRECISION y;
FILE *plot_function, *plot_error,
*plot_partfrac, *plot_contfrac, *plot_cayley;
@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
}
rdata = type == 2
? higham((PRECISION) eps, n)
: zolotarev((PRECISION) eps, n, type);
? higham((ZOLO_PRECISION) eps, n)
: zolotarev((ZOLO_PRECISION) eps, n, type);
printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
"\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
"\tPRECISION = " STRINGIFY(PRECISION)
"\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
"\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
"\tDelta = %g (maximum error)\n\n"
"\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
x = 2.4 * (float) m / plotpts - 1.2;
if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
/* skip x = 0 for type 1, as R(0) is singular */
y = zolotarev_eval((PRECISION) x, rdata);
y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
fprintf(plot_function, "%g %g\n", x, (float) y);
fprintf(plot_error, "%g %g\n",
x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
maxypferr = MAX(maxypferr, fabs(ypferr));

View File

@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION
#define PRECISION double
#ifndef ZOLO_PRECISION
#define ZOLO_PRECISION double
#endif
#define ZPRECISION PRECISION
#define ZPRECISION ZOLO_PRECISION
#define ZOLOTAREV_DATA zolotarev_data
#endif
@@ -77,8 +77,8 @@ typedef struct {
* zolotarev_data structure. The arguments must satisfy the constraints that
* epsilon > 0, n > 0, and type = 0 or 1. */
ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
void zolotarev_free(zolotarev_data *zdata);
#endif
@@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_BEGIN(Grid);
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
NAMESPACE_END(Grid);

File diff suppressed because it is too large Load Diff

View File

@@ -113,6 +113,42 @@ public:
blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard();
};
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
int Nevec = (int)evec_coarse.size();
int Nsrc = (int)src.size();
// make temp variables
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
//Preporcessing
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
guess_coarse[j] = Zero();
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockProject(src_coarse[j],src[j],subspace);
}
//deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
for (int i=0;i<Nevec;i++)
{
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
const CoarseField & tmp = evec_coarse[i];
for (int j=0;j<Nsrc;j++)
{
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
}
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
};

View File

@@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,513 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSDeflation.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
MultiRHS block projection
Import basis -> nblock x nbasis x (block x internal)
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
=> coarse_(nrhs x nbasis )^block = via batched GEMM
//template<class vobj,class CComplex,int nbasis,class VLattice>
//inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
// const VLattice &fineData,
// const VLattice &Basis)
*/
template<class Field>
class MultiRHSBlockProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef Field Fermion;
int nbasis;
GridBase *coarse_grid;
GridBase *fine_grid;
uint64_t block_vol;
uint64_t fine_vol;
uint64_t coarse_vol;
uint64_t words;
// Row major layout "C" order:
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
/*
* in Fortran column major notation (cuBlas order)
*
* Vxb = [v1(x)][..][vn(x)] ... x coarse vol
*
* Fxr = [r1(x)][..][rm(x)] ... x coarse vol
*
* Block project:
* C_br = V^dag F x coarse vol
*
* Block promote:
* F_xr = Vxb Cbr x coarse_vol
*/
deviceVector<scalar> BLAS_V; // words * block_vol * nbasis x coarse_vol
deviceVector<scalar> BLAS_F; // nrhs x fine_vol * words -- the sources
deviceVector<scalar> BLAS_C; // nrhs x coarse_vol * nbasis -- the coarse coeffs
RealD blasNorm2(deviceVector<scalar> &blas)
{
scalar ss(0.0);
std::vector<scalar> tmp(blas.size());
acceleratorCopyFromDevice(&blas[0],&tmp[0],blas.size()*sizeof(scalar));
for(int64_t s=0;s<blas.size();s++){
ss=ss+tmp[s]*adj(tmp[s]);
}
coarse_grid->GlobalSum(ss);
return real(ss);
}
MultiRHSBlockProject(){};
~MultiRHSBlockProject(){ Deallocate(); };
void Deallocate(void)
{
nbasis=0;
coarse_grid=nullptr;
fine_grid=nullptr;
fine_vol=0;
block_vol=0;
coarse_vol=0;
words=0;
BLAS_V.resize(0);
BLAS_F.resize(0);
BLAS_C.resize(0);
}
void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid)
{
nbasis=_nbasis;
fine_grid=_fgrid;
coarse_grid=_cgrid;
fine_vol = fine_grid->lSites();
coarse_vol = coarse_grid->lSites();
block_vol = fine_vol/coarse_vol;
words = sizeof(scalar_object)/sizeof(scalar);
BLAS_V.resize (fine_vol * words * nbasis );
}
void ImportFineGridVectors(std::vector <Field > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename Field::vector_object vobj;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
assert(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
}
uint64_t sz = blas.size();
acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar));
Coordinate fine_rdimensions = fine_grid->_rdimensions;
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
int64_t bv= block_vol;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector importing vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
autoView( fineData , vecs[v], AcceleratorRead);
auto blasData_p = &blas[0];
auto fineData_p = &fineData[0];
int64_t osites = fine_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
// std::cout << "sz "<<sz<<std::endl;
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
uint64_t lwords= words; // local variable for copy in to GPU
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
// One thread per fine site
Coordinate coor_f(_ndimension);
Coordinate coor_b(_ndimension);
Coordinate coor_c(_ndimension);
// Fine site to fine coor
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
int sc;// coarse site
int sb;// block site
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
scalar_object data = extractLane(lane,fineData[sf]);
// BLAS layout address calculation
// words * block_vol * nbasis x coarse_vol
// coarse oSite x block vole x lanes
int64_t site = (lane*osites + sc*bv)*nvec
+ v*bv
+ sb;
// assert(site*lwords<sz);
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
*ptr = data;
#ifdef GRID_SIMT
}
#else
}
#endif
});
// std::cout << " import fine Blas norm "<<blasNorm2(blas)<<std::endl;
// std::cout << " BlockProjector imported vector"<<v<<std::endl;
}
}
void ExportFineGridVectors(std::vector <Field> &vecs, deviceVector<scalar> &blas)
{
typedef typename Field::vector_object vobj;
int nvec = vecs.size();
assert(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
}
Coordinate fine_rdimensions = fine_grid->_rdimensions;
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
// std::cout << " export fine Blas norm "<<blasNorm2(blas)<<std::endl;
int64_t bv= block_vol;
for(int v=0;v<vecs.size();v++){
autoView( fineData , vecs[v], AcceleratorWrite);
auto blasData_p = &blas[0];
auto fineData_p = &fineData[0];
int64_t osites = fine_grid->oSites();
uint64_t lwords = words;
// std::cout << " Nsimd is "<<vobj::Nsimd() << std::endl;
// std::cout << " lwords is "<<lwords << std::endl;
// std::cout << " sizeof(scalar_object) is "<<sizeof(scalar_object) << std::endl;
// loop over fine sites
accelerator_for(sf,osites,vobj::Nsimd(),{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(vobj::Nsimd()); // buffer lane
#else
for(int lane=0;lane<vobj::Nsimd();lane++) {
#endif
// One thread per fine site
Coordinate coor_f(_ndimension);
Coordinate coor_b(_ndimension);
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
int sc;
int sb;
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
// BLAS layout address calculation
// words * block_vol * nbasis x coarse_vol
int64_t site = (lane*osites + sc*bv)*nvec
+ v*bv
+ sb;
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
scalar_object data = *ptr;
insertLane(lane,fineData[sf],data);
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
}
template<class vobj>
void ImportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector importing coarse vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
autoView( coarseData , vecs[v], AcceleratorRead);
auto blasData_p = &blas[0];
auto coarseData_p = &coarseData[0];
int64_t osites = coarse_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
// C_br per site
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
coarse_scalar_object data = extractLane(lane,coarseData[sc]);
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
*ptr = data;
#ifdef GRID_SIMT
}
#else
}
#endif
});
// std::cout << " import coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
}
}
template<class vobj>
void ExportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
// std::cout << " export coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector exporting coarse vector"<<v<<std::endl;
autoView( coarseData , vecs[v], AcceleratorWrite);
auto blasData_p = &blas[0];
auto coarseData_p = &coarseData[0];
int64_t osites = coarse_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
// Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
coarse_scalar_object data = *ptr;
insertLane(lane,coarseData[sc],data);
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
}
void ImportBasis(std::vector < Field > &vecs)
{
// std::cout << " BlockProjector Import basis size "<<vecs.size()<<std::endl;
ImportFineGridVectors(vecs,BLAS_V);
}
template<class cobj>
void blockProject(std::vector<Field> &fine,std::vector< Lattice<cobj> > & coarse)
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
assert(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
/////////////////////////////////////////////
// Copy in the multi-rhs sources to same data layout
/////////////////////////////////////////////
// std::cout << "BlockProject import fine"<<std::endl;
ImportFineGridVectors(fine,BLAS_F);
deviceVector<scalar *> Vd(coarse_vol);
deviceVector<scalar *> Fd(coarse_vol);
deviceVector<scalar *> Cd(coarse_vol);
// std::cout << "BlockProject pointers"<<std::endl;
for(int c=0;c<coarse_vol;c++){
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
acceleratorPut(Vd[c],Vh);
acceleratorPut(Fd[c],Fh);
acceleratorPut(Cd[c],Ch);
}
GridBLAS BLAS;
// std::cout << "BlockProject BLAS"<<std::endl;
int64_t vw = block_vol * words;
/////////////////////////////////////////
// C_br = V^dag R
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw,
scalar(1.0),
Vd,
Fd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl;
ExportCoarseGridVectors(coarse, BLAS_C);
// std::cout << "BlockProject done"<<std::endl;
}
template<class cobj>
void blockPromote(std::vector<Field> &fine,std::vector<Lattice<cobj> > & coarse)
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
assert(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
ImportCoarseGridVectors(coarse, BLAS_C);
GridBLAS BLAS;
deviceVector<scalar *> Vd(coarse_vol);
deviceVector<scalar *> Fd(coarse_vol);
deviceVector<scalar *> Cd(coarse_vol);
for(int c=0;c<coarse_vol;c++){
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
acceleratorPut(Vd[c],Vh);
acceleratorPut(Fd[c],Fh);
acceleratorPut(Cd[c],Ch);
}
/////////////////////////////////////////
// Block promote:
// F_xr = Vxb Cbr (x coarse_vol)
/////////////////////////////////////////
int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nbasis,
scalar(1.0),
Vd,
Cd,
scalar(0.0), // wipe out C
Fd);
BLAS.synchronise();
// std::cout << " blas call done"<<std::endl;
ExportFineGridVectors(fine, BLAS_F);
// std::cout << " exported "<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,233 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSDeflation.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs projection
i) MultiRHS Deflation
Import Evecs -> nev x vol x internal
Import vector of Lattice objects -> nrhs x vol x internal
=> Cij (nrhs x Nev) via GEMM.
=> Guess (nrhs x vol x internal) = C x evecs (via GEMM)
Export
ii) MultiRHS block projection
Import basis -> nblock x nbasis x (block x internal)
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
=> coarse_(nrhs x nbasis )^block = via batched GEMM
iii) Alternate interface:
Import higher dim Lattice object-> vol x nrhs layout
*/
template<class Field>
class MultiRHSDeflation
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
int nev;
std::vector<RealD> eval;
GridBase *grid;
uint64_t vol;
uint64_t words;
deviceVector<scalar> BLAS_E; // nev x vol -- the eigenbasis (up to a 1/sqrt(lambda))
deviceVector<scalar> BLAS_R; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_G; // nrhs x vol -- the guess
deviceVector<scalar> BLAS_C; // nrhs x nev -- the coefficients
MultiRHSDeflation(){};
~MultiRHSDeflation(){ Deallocate(); };
void Deallocate(void)
{
nev=0;
grid=nullptr;
vol=0;
words=0;
BLAS_E.resize(0);
BLAS_R.resize(0);
BLAS_C.resize(0);
BLAS_G.resize(0);
}
void Allocate(int _nev,GridBase *_grid)
{
nev=_nev;
grid=_grid;
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
eval.resize(nev);
BLAS_E.resize (vol * words * nev );
std::cout << GridLogMessage << " Allocate for "<<nev<<" eigenvectors and volume "<<vol<<std::endl;
}
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{
// std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
assert(ev<eval.size());
eval[ev] = _eval;
int64_t offset = ev*vol*words;
autoView(v,evec,AcceleratorRead);
acceleratorCopyDeviceToDevice(&v[0],&BLAS_E[offset],sizeof(scalar_object)*vol);
}
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval)
{
ImportEigenBasis(evec,_eval,0,evec.size());
}
// Could use to import a batch of eigenvectors
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
{
assert(_ev0+_nev<=evec.size());
Allocate(_nev,evec[0].Grid());
// Imports a sub-batch of eigenvectors, _ev0, ..., _ev0+_nev-1
for(int e=0;e<nev;e++){
std::cout << "Importing eigenvector "<<e<<" evalue "<<_eval[_ev0+e]<<std::endl;
ImportEigenVector(evec[_ev0+e],_eval[_ev0+e],e);
}
}
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
{
int nrhs = source.size();
assert(source.size()==guess.size());
assert(grid == guess[0].Grid());
conformable(guess[0],source[0]);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_R.resize(nrhs * vw); // cost free if size doesn't change
BLAS_G.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nev * nrhs);// cost free if size doesn't change
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
// for(int r=0;r<nrhs;r++){
// std::cout << " source["<<r<<"] = "<<norm2(source[r])<<std::endl;
// }
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(v,source[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&v[0],&BLAS_R[offset],sizeof(scalar_object)*vol);
}
/*
* in Fortran column major notation (cuBlas order)
*
* Exe = [e1(x)][..][en(x)]
*
* Rxr = [r1(x)][..][rm(x)]
*
* C_er = E^dag R
* C_er = C_er / lambda_e
* G_xr = Exe Cer
*/
deviceVector<scalar *> Ed(1);
deviceVector<scalar *> Rd(1);
deviceVector<scalar *> Cd(1);
deviceVector<scalar *> Gd(1);
scalar * Eh = & BLAS_E[0];
scalar * Rh = & BLAS_R[0];
scalar * Ch = & BLAS_C[0];
scalar * Gh = & BLAS_G[0];
acceleratorPut(Ed[0],Eh);
acceleratorPut(Rd[0],Rh);
acceleratorPut(Cd[0],Ch);
acceleratorPut(Gd[0],Gh);
GridBLAS BLAS;
/////////////////////////////////////////
// C_er = E^dag R
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw,
scalar(1.0),
Ed,
Rd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
assert(BLAS_C.size()==nev*nrhs);
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nev*nrhs);
for(int e=0;e<nev;e++){
RealD lam(1.0/eval[e]);
for(int r=0;r<nrhs;r++){
int off = e+nev*r;
HOST_C[off]=HOST_C[off] * lam;
// std::cout << "C["<<e<<"]["<<r<<"] ="<<HOST_C[off]<< " eval[e] "<<eval[e] <<std::endl;
}
}
acceleratorCopyToDevice(&HOST_C[0],&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/////////////////////////////////////////
// Guess G_xr = Exe Cer
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev,
scalar(1.0),
Ed, // x . nev
Cd, // nev . nrhs
scalar(0.0),
Gd);
BLAS.synchronise();
///////////////////////////////////////
// Copy out the multirhs
///////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(v,guess[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_G[offset],&v[0],sizeof(scalar_object)*vol);
}
RealD t1 = usecond();
std::cout << GridLogMessage << "MultiRHSDeflation for "<<nrhs<<" sources with "<<nev<<" eigenvectors took " << (t1-t0)/1e3 <<" ms"<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@@ -33,109 +33,111 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Deflation methods considered
* -- Solve P A x = P b [ like Luscher ]
* DEF-1 M P A x = M P b [i.e. left precon]
* DEF-2 P^T M A x = P^T M b
* ADEF-1 Preconditioner = M P + Q [ Q + M + M A Q]
* ADEF-2 Preconditioner = P^T M + Q
* BNN Preconditioner = P^T M P + Q
* BNN2 Preconditioner = M P + P^TM +Q - M P A M
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
* Vout = x
*/
NAMESPACE_BEGIN(Grid);
// abstract base
template<class Field, class CoarseField>
class TwoLevelFlexiblePcg : public LinearFunction<Field>
template<class Field>
class TwoLevelCG : public LinearFunction<Field>
{
public:
int verbose;
RealD Tolerance;
Integer MaxIterations;
const int mmax = 5;
GridBase *grid;
GridBase *coarsegrid;
LinearOperatorBase<Field> *_Linop
OperatorFunction<Field> *_Smoother,
LinearFunction<CoarseField> *_CoarseSolver;
// Need somthing that knows how to get from Coarse to fine and back again
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
// more most opertor functions
TwoLevelFlexiblePcg(RealD tol,
TwoLevelCG(RealD tol,
Integer maxit,
LinearOperatorBase<Field> *Linop,
LinearOperatorBase<Field> *SmootherLinop,
OperatorFunction<Field> *Smoother,
OperatorFunction<CoarseField> CoarseLinop
) :
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
Tolerance(tol),
MaxIterations(maxit),
_Linop(Linop),
_PreconditionerLinop(PrecLinop),
_Preconditioner(Preconditioner)
_FineLinop(FineLinop),
_Smoother(Smoother)
{
verbose=0;
grid = fine;
};
// The Pcg routine is common to all, but the various matrices differ from derived
// implementation to derived implmentation
void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
virtual void operator() (const Field &src, Field &x)
{
std::cout << GridLogMessage<<"HDCG: fPcg starting single RHS"<<std::endl;
RealD f;
RealD rtzp,rtz,a,d,b;
RealD rptzp;
RealD tn;
RealD guess = norm2(psi);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 5;
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
std::vector<Field> p(mmax,grid);
std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax);
Field x (grid); x = psi;
Field z(grid);
Field tmp(grid);
Field mp (grid);
Field r (grid);
Field mu (grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated"<<std::endl;
//Initial residual computation & set up
RealD guess = norm2(x);
std::cout << GridLogMessage<<"HDCG: fPcg guess nrm "<<guess<<std::endl;
RealD src_nrm = norm2(src);
std::cout << GridLogMessage<<"HDCG: fPcg src nrm "<<src_nrm<<std::endl;
if ( src_nrm == 0.0 ) {
std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
x=Zero();
}
RealD tn;
GridStopWatch HDCGTimer;
HDCGTimer.Start();
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
x=src;
Vstart(x,src);
// r0 = b -A x0
HermOp(x,mmp); // Shouldn't this be something else?
_FineLinop.HermOp(x,mmp[0]);
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
{
double n1 = norm2(x);
double n2 = norm2(mmp[0]);
double n3 = norm2(r);
std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
M1(r,z,tmp,mp,SmootherMirs);
PcgM1(r,z);
rtzp =real(innerProduct(r,z));
///////////////////////////////////////
// Solve for Mss mu = P A z and set p = z-mu
// Def2: p = 1 - Q Az = Pright z
// Def2 p = 1 - Q Az = Pright z
// Other algos M2 is trivial
///////////////////////////////////////
M2(z,p[0]);
PcgM2(z,p[0]);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
Field pp(grid);
for (int k=0;k<=MaxIterations;k++){
@@ -143,7 +145,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
int peri_kp = (k+1) % mmax;
rtz=rtzp;
d= M3(p[peri_k],mp,mmp[peri_k],tmp);
d= PcgM3(p[peri_k],mmp[peri_k]);
a = rtz/d;
// Memorise this
@@ -153,21 +155,36 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
// Compute z = M x
M1(r,z,tmp,mp);
PcgM1(r,z);
{
RealD n1,n2;
n1=norm2(r);
n2=norm2(z);
std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
}
rtzp =real(innerProduct(r,z));
std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
// PcgM2(z,p[0]);
PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
p[peri_kp]=p[peri_k];
p[peri_kp]=mu;
// Standard search direction p -> z + b p ; b =
// Standard search direction p -> z + b p
b = (rtzp)/rtz;
int northog;
// k=zero <=> peri_kp=1; northog = 1
// k=1 <=> peri_kp=2; northog = 2
// ... ... ...
// k=mmax-2<=> peri_kp=mmax-1; northog = mmax-1
// k=mmax-1<=> peri_kp=0; northog = 1
// northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
@@ -176,75 +193,324 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
}
RealD rrn=sqrt(rn/ssq);
std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
RealD rtn=sqrt(rtz/ssq);
RealD rtnp=sqrt(rtzp/ssq);
std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";
// Stopping condition
if ( rn <= rsq ) {
HermOp(x,mmp); // Shouldn't this be something else?
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
_FineLinop.HermOp(x,mmp[0]);
axpy(tmp,-1.0,src,mmp[0]);
RealD psinorm = sqrt(norm2(x));
RealD mmpnorm = sqrt(norm2(mmp[0]));
RealD xnorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
return k;
std::cout<<GridLogMessage
<<"HDCG: true residual is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
RealD xnorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated p"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated mmp"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::cout << GridLogMessage<<"HDCG: fPcg allocated pAp"<<std::endl;
src[0].Grid()->Barrier();
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated z,mp,r,mu"<<std::endl;
src[0].Grid()->Barrier();
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
HDCGTimer.Start();
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
a[rhs] = rtz[rhs]/d[rhs];
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
}
// Compute z = M x (for *all* RHS)
PcgM1(r,z);
std::cout << GridLogMessage<<"HDCG::fPcg M1 complete"<<std::endl;
grid->Barrier();
RealD max_rn=0.0;
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG: rhs "<<rhs<<"fPcg k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
}
// Non-convergence
assert(0);
}
public:
virtual void M(Field & in,Field & out,Field & tmp) {
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out)
{
std::cout << "PcgM1 default (cheat) mrhs version"<<std::endl;
for(int rhs=0;rhs<in.size();rhs++){
this->PcgM1(in[rhs],out[rhs]);
}
}
virtual void PcgM1(Field & in, Field & out) =0;
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
std::cout << "Vstart default (cheat) mrhs version"<<std::endl;
for(int rhs=0;rhs<x.size();rhs++){
this->Vstart(x[rhs],src[rhs]);
}
}
virtual void Vstart(Field & x,const Field & src)=0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
}
virtual void M1(Field & in, Field & out) {// the smoother
virtual RealD PcgM3(const Field & p, Field & mmp){
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public TwoLevelCG<Field>
{
public:
///////////////////////////////////////////////////////////////////////////////////
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
///////////////////////////////////////////////////////////////////////////////////
GridBase *coarsegrid;
Aggregation &_Aggregates;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
///////////////////////////////////////////////////////////////////////////////////
// more most opertor functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
{
coarsegrid = Aggregates.CoarseGrid;
};
virtual void PcgM1(Field & in, Field & out)
{
GRID_TRACE("MultiGridPreconditioner ");
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
PcgM(in,Min); // Smoother call
Field tmp(this->grid);
Field Min(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
HermOp(Min,out);
GridStopWatch SmootherTimer;
GridStopWatch MatrixTimer;
SmootherTimer.Start();
this->_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
this->_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min
ProjectToSubspace(tmp,PleftProj);
ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
GridStopWatch ProjTimer;
GridStopWatch CoarseTimer;
GridStopWatch PromTimer;
ProjTimer.Start();
this->_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp
}
virtual void M2(const Field & in, Field & out) {
out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
}
virtual RealD M3(const Field & p, Field & mmp){
double d,dd;
HermOpAndNorm(p,mmp,d,dd);
return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
}
virtual void VstartDef2(Field & xconst Field & src){
//case PcgDef2:
//case PcgAdef2:
//case PcgAdef2f:
//case PcgV11f:
virtual void Vstart(Field & x,const Field & src)
{
std::cout << GridLogMessage<<"HDCG: fPcg Vstart "<<std::endl;
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@@ -256,142 +522,78 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
Field r(grid);
Field mmp(grid);
Field r(this->grid);
Field mmp(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
ProjectToSubspace(r,PleftProj);
ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp;
std::cout << GridLogMessage<<"HDCG: fPcg Vstart projecting "<<std::endl;
this->_Aggregates.ProjectToSubspace(PleftProj,src);
std::cout << GridLogMessage<<"HDCG: fPcg Vstart coarse solve "<<std::endl;
this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);
}
};
template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field>
{
public:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
TwoLevelADEF1defl(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
std::vector<Field> &_evec,
std::vector<RealD> &_eval) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
evec(_evec),
eval(_eval)
{};
// Can just inherit existing M2
// Can just inherit existing M3
// Simple vstart - do nothing
virtual void Vstart(Field & x,const Field & src){
return;
x=src; // Could apply Q
};
// Override PcgM1
virtual void PcgM1(Field & in, Field & out)
{
GRID_TRACE("EvecPreconditioner ");
int N=evec.size();
Field Pin(this->grid);
Field Qin(this->grid);
//MP + Q = M(1-AQ) + Q = M
// // If we are eigenvector deflating in coarse space
// // Q = Sum_i |phi_i> 1/lambda_i <phi_i|
// // A Q = Sum_i |phi_i> <phi_i|
// // M(1-AQ) = M(1-proj) + Q
Qin.Checkerboard()=in.Checkerboard();
Qin = Zero();
Pin = in;
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
auto ip = TensorRemove(innerProduct(tmp,in));
axpy(Qin, ip / eval[i],tmp,Qin);
axpy(Pin, -ip ,tmp,Pin);
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
this->_Smoother(Pin,out);
out = out + Qin;
}
};
////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out){
// P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ]
Field in_sbar(grid);
NAMESPACE_END(Grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
}
virtual void Pleft (Field & in,Field & out){
// P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ]
Field in_sbar(grid);
Field tmp2(grid);
Field Mtmp(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out);
HermOp(out,Mtmp);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
}
}
template<class Field>
class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp){
}
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
}
virtual void M2(Field & in, Field & out){
}
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
}
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif

View File

@@ -0,0 +1,734 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/AdefGeneric.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
/*
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
*/
NAMESPACE_BEGIN(Grid);
template<class Field>
class TwoLevelCGmrhs
{
public:
RealD Tolerance;
Integer MaxIterations;
GridBase *grid;
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer;
GridStopWatch DeflateTimer;
GridStopWatch CoarseTimer;
GridStopWatch FineTimer;
GridStopWatch SmoothTimer;
GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions
TwoLevelCGmrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
Tolerance(tol),
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{
grid = fine;
};
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_zz,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
std::vector<Field> & MQ,
const std::vector<Field> & Z,
const std::vector<Field> & MZ)
{
RealD t0=usecond();
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
RealD t1=usecond();
m_zz = 0.5*(m_zz+m_zz.adjoint());
Eigen::MatrixXcd L = m_zz.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
RealD t3=usecond();
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
RealD t4=usecond();
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
assert(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
// std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
M3Timer.Start();
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
M3Timer.Stop();
a[rhs] = rtz[rhs]/d[rhs];
LinalgTimer.Start();
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
LinalgTimer.Stop();
}
// Compute z = M x (for *all* RHS)
M1Timer.Start();
PcgM1(r,z);
M1Timer.Stop();
RealD max_rn=0.0;
LinalgTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
// std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG:fPcg rhs "<<rhs<<" k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
LinalgTimer.Stop();
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : fine M3 "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
}
public:
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out) = 0;
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src) = 0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
}
virtual RealD PcgM3(const Field & p, Field & mmp){
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
};
template<class Field, class CoarseField>
class TwoLevelADEF2mrhs : public TwoLevelCGmrhs<Field>
{
public:
GridBase *coarsegrid;
GridBase *coarsegridmrhs;
LinearFunction<CoarseField> &_CoarseSolverMrhs;
LinearFunction<CoarseField> &_CoarseSolverPreciseMrhs;
MultiRHSBlockProject<Field> &_Projector;
MultiRHSDeflation<CoarseField> &_Deflator;
TwoLevelADEF2mrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolverMrhs,
LinearFunction<CoarseField> &CoarseSolverPreciseMrhs,
MultiRHSBlockProject<Field> &Projector,
MultiRHSDeflation<CoarseField> &Deflator,
GridBase *_coarsemrhsgrid) :
TwoLevelCGmrhs<Field>(tol, maxit,FineLinop,Smoother,Projector.fine_grid),
_CoarseSolverMrhs(CoarseSolverMrhs),
_CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs),
_Projector(Projector),
_Deflator(Deflator)
{
coarsegrid = Projector.coarse_grid;
coarsegridmrhs = _coarsemrhsgrid;// Thi could be in projector
};
// Override Vstart
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
int nrhs=x.size();
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
// = [1 - Ass_inv A] Guess + Assinv src
// = P^T guess + Assinv src
// = Vstart [Tang notation]
// This gives:
// W^T (src - A x_0) = src_s - A guess_s - r_s
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
this->_Projector.blockProject(src,PleftProj);
this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
for(int rhs=0;rhs<nrhs;rhs++) {
InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
}
this->_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
}
this->_Projector.blockPromote(x,PleftMss_proj);
}
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
int nrhs=in.size();
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
std::vector<Field> tmp(nrhs,this->grid);
std::vector<Field> Min(nrhs,this->grid);
std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
// this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
this->_Smoother(in,Min);
this->SmoothTimer.Stop();
#else
for(int rhs=0;rhs<nrhs;rhs++) {
this->SmoothTimer.Start();
this->_Smoother(in[rhs],Min[rhs]);
this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
this->FineTimer.Start();
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
this->FineTimer.Stop();
}
this->ProjectTimer.Start();
this->_Projector.blockProject(tmp,PleftProj);
this->ProjectTimer.Stop();
this->DeflateTimer.Start();
this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
this->DeflateTimer.Stop();
this->InsertTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++) {
InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
}
this->InsertTimer.Stop();
this->CoarseTimer.Start();
this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
this->CoarseTimer.Stop();
this->InsertTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
}
this->InsertTimer.Stop();
this->PromoteTimer.Start();
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop();
this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
}
// this->zzz=out[0];
this->FineTimer.Stop();
}
};
NAMESPACE_END(Grid);

View File

@@ -31,6 +31,58 @@ directory
NAMESPACE_BEGIN(Grid);
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
typedef typename Field::scalar_type scomplex;
int Nblock = X.size();
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
//
//Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
template<class Field>
double normv(const std::vector<Field> &P){
int Nblock = P.size();
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
//////////////////////////////////////////////////////////////////////////
@@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
const std::vector<Field> & R)
{
InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
@@ -186,6 +256,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q;
@@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer;
SolverTimer.Start();
RealD max_resid=0;
int k;
for (k = 1; k <= MaxIterations; k++) {
@@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
max_resid=0;
RealD rrsum=0;
RealD rr;
@@ -322,7 +398,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
@@ -466,43 +544,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
IterationsToComplete = k;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation:
//--------------------------
@@ -549,6 +590,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
}
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);

View File

@@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
@@ -54,10 +55,26 @@ public:
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
ErrorOnNoConverge(err_on_no_conv)
{};
virtual void LogIteration(int k,RealD a,RealD b){
// std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
};
virtual void LogBegin(void){
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
this->LogBegin();
GRID_TRACE("ConjugateGradient");
GridStopWatch PreambleTimer;
GridStopWatch ConstructTimer;
GridStopWatch NormTimer;
GridStopWatch AssignTimer;
PreambleTimer.Start();
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
@@ -65,22 +82,32 @@ public:
RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred;
Field p(src);
Field mmp(src);
Field r(src);
// Was doing copies
ConstructTimer.Start();
Field p (src.Grid());
Field mmp(src.Grid());
Field r (src.Grid());
ConstructTimer.Stop();
// Initial residual computation & set up
NormTimer.Start();
ssq = norm2(src);
RealD guess = norm2(psi);
NormTimer.Stop();
assert(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) {
r = src;
p = r;
a = ssq;
} else {
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
}
cp = a;
ssq = norm2(src);
AssignTimer.Stop();
// Handle trivial case of zero src
if (ssq == 0.){
@@ -110,6 +137,7 @@ public:
std::cout << GridLogIterative << std::setprecision(8)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
PreambleTimer.Stop();
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
@@ -117,9 +145,13 @@ public:
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
RealD usecs = -usecond();
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
GridStopWatch IterationTimer;
IterationTimer.Start();
c = cp;
MatrixTimer.Start();
@@ -151,32 +183,44 @@ public:
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
LogIteration(k,a,b);
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
IterationTimer.Stop();
if ( (k % 500) == 0 ) {
std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
} else {
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
}
// Stopping condition
if (cp <= rsq) {
usecs +=usecond();
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
GridBase *grid = src.Grid();
RealD DwfFlops = (1452. )*grid->gSites()*4*k
+ (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl;
std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
// std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver Elapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -187,17 +231,143 @@ public:
}
}
// Failed. Calculate true residual before giving up
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
// Linop.HermOpAndNorm(psi, mmp, d, qq);
// p = mmp - src;
//TrueResidual = sqrt(norm2(p)/ssq);
// TrueResidual = 1;
TrueResidual = sqrt(norm2(p)/ssq);
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
SolverTimer.Stop();
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage<< "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
// Optionally record the CG polynomial
std::vector<double> ak;
std::vector<double> bk;
std::vector<double> poly_p;
std::vector<double> poly_r;
std::vector<double> poly_Ap;
std::vector<double> polynomial;
public:
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
{ };
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
Field tmp(src.Grid());
Field AtoN(src.Grid());
AtoN = src;
psi=AtoN*polynomial[0];
for(int n=1;n<polynomial.size();n++){
tmp = AtoN;
Linop.HermOp(tmp,AtoN);
psi = psi + polynomial[n]*AtoN;
}
}
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
{
Field Ap(src.Grid());
Field r(src.Grid());
Field p(src.Grid());
p=src;
r=src;
x=Zero();
x.Checkerboard()=src.Checkerboard();
for(int k=0;k<ak.size();k++){
x = x + ak[k]*p;
Linop.HermOp(p,Ap);
r = r - ak[k] * Ap;
p = r + bk[k] * p;
}
}
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
psi=Zero();
this->operator ()(Linop,src,psi);
}
virtual void LogBegin(void)
{
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
ak.resize(0);
bk.resize(0);
polynomial.resize(0);
poly_Ap.resize(0);
poly_Ap.resize(0);
poly_p.resize(1);
poly_r.resize(1);
poly_p[0]=1.0;
poly_r[0]=1.0;
};
virtual void LogIteration(int k,RealD a,RealD b)
{
// With zero guess,
// p = r = src
//
// iterate:
// x = x + a p
// r = r - a A p
// p = r + b p
//
// [0]
// r = x
// p = x
// Ap=0
//
// [1]
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
// x = x + a p ==> add polynomials term by term
// r = r - a A p ==> add polynomials term by term
// p = r + b p ==> add polynomials term by term
//
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
ak.push_back(a);
bk.push_back(b);
// Ap= right_shift(p)
poly_Ap.resize(k+1);
poly_Ap[0]=0.0;
for(int i=0;i<k;i++){
poly_Ap[i+1]=poly_p[i];
}
// x = x + a p
polynomial.resize(k);
polynomial[k-1]=0.0;
for(int i=0;i<k;i++){
polynomial[i] = polynomial[i] + a * poly_p[i];
}
// r = r - a Ap
// p = r + b p
poly_r.resize(k+1);
poly_p.resize(k+1);
poly_r[k] = poly_p[k] = 0.0;
for(int i=0;i<k+1;i++){
poly_r[i] = poly_r[i] - a * poly_Ap[i];
poly_p[i] = poly_r[i] + b * poly_p[i];
}
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -82,11 +82,6 @@ NAMESPACE_BEGIN(Grid);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid();
//Generate precision change workspaces
precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
@@ -114,21 +109,24 @@ NAMESPACE_BEGIN(Grid);
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d, wk_sp_from_dp);
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
sol_f = Zero();
@@ -147,7 +145,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f, wk_dp_from_sp);
precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);

View File

@@ -0,0 +1,213 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -44,7 +44,7 @@ public:
using OperatorFunction<Field>::operator();
RealD Tolerance;
// RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@@ -84,6 +84,7 @@ public:
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
GRID_TRACE("ConjugateGradientMultiShift");
GridBase *grid = src.Grid();
@@ -101,11 +102,11 @@ public:
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
// remove dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;
@@ -143,7 +144,7 @@ public:
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl;
<<" target resid^2 "<<rsq[s]<<std::endl;
ps[s] = src;
}
// r and p for primary
@@ -325,7 +326,7 @@ public:
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k;

View File

@@ -0,0 +1,373 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChange(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChange(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChange(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChange(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);

View File

@@ -81,6 +81,7 @@ public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@@ -95,9 +96,9 @@ public:
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq
) :
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
@@ -127,9 +128,11 @@ public:
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
@@ -153,10 +156,11 @@ public:
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;
@@ -165,12 +169,8 @@ public:
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF r_f(SinglePrecGrid);
FieldF p_f(SinglePrecGrid);
FieldF tmp_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
FieldF src_f(SinglePrecGrid);
precisionChange(src_f, src_d, wk_f_from_d);
// Check lightest mass
for(int s=0;s<nshift;s++){
@@ -195,18 +195,26 @@ public:
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
r_f=src_f; //residual maintained in single
p_f=src_f;
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f, p_d, pc_wk_d_to_s);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
axpy(mmp_f,mass[0],p_f,mmp_f);
RealD rn = norm2(p_f);
precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
assert(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
@@ -224,7 +232,7 @@ public:
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_f,b,mmp_f,r_f);
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
@@ -240,14 +248,9 @@ public:
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
//Update double precision search direction by residual
PrecChangeTimer.Start();
precisionChange(r_d, r_f, wk_d_from_f);
PrecChangeTimer.Stop();
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
@@ -264,19 +267,23 @@ public:
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
d=real(innerProduct(p_f,mmp_f));
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
PrecChangeTimer.Stop();
AXPYTimer.Start();
axpy(mmp_f,mass[0],p_f,mmp_f);
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
AXPYTimer.Stop();
RealD rn = norm2(p_f);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
@@ -307,12 +314,12 @@ public:
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
c = c_f;
if(k % ReliableUpdateFreq == 0){
RealD c_old = c;
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
@@ -321,15 +328,10 @@ public:
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
c = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
PrecChangeTimer.Start();
precisionChange(r_f, r_d, wk_f_from_d);
PrecChangeTimer.Stop();
c = c_d;
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
}
// Convergence checks
@@ -341,7 +343,7 @@ public:
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
@@ -352,11 +354,16 @@ public:
}
}
if ( all_converged ){
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
@@ -399,11 +406,9 @@ public:
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
assert(0);
}
};

View File

@@ -48,7 +48,7 @@ public:
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,7 +65,9 @@ public:
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
{
assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
@@ -73,6 +75,7 @@ public:
}
void operator()(const FieldD &src, FieldD &psi) {
GRID_TRACE("ConjugateGradientReliableUpdate");
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
@@ -115,9 +118,12 @@ public:
}
//Single prec initialization
precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r);
precisionChange(r_f, r, pc_wk_dp_to_sp);
FieldF psi_f(r_f);
psi_f = Zero();
@@ -133,6 +139,7 @@ public:
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
GridStopWatch PrecChangeTimer;
SolverTimer.Start();
int k = 0;
@@ -172,7 +179,9 @@ public:
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
psi = psi + mmp;
@@ -193,6 +202,9 @@ public:
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
@@ -213,14 +225,21 @@ public:
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
psi = psi + mmp;
MatrixTimer.Start();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
MatrixTimer.Stop();
r = src - mmp;
psi_f = Zero();
precisionChange(r_f, r);
PrecChangeTimer.Start();
precisionChange(r_f, r, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
cp = norm2(r);
MaxResidSinceLastRelUp = cp;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -79,14 +79,16 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public Imp
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<" target " << eresid*eresid << " conv " <<conv
<<std::endl;
return conv;
}
};
@@ -243,9 +245,10 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@@ -253,6 +256,7 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
@@ -419,14 +423,15 @@ until convergence
}
}
if ( Nconv < Nstop )
if ( Nconv < Nstop ) {
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
}
eval=eval2;
//Keep only converged
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
eval.resize(Nstop);// was Nconv
evec.resize(Nstop,grid);// was Nconv
basisSortInPlace(evec,eval,reverse);
}
@@ -456,7 +461,7 @@ until convergence
std::vector<Field>& evec,
Field& w,int Nm,int k)
{
std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20;
assert( k< Nm );
@@ -464,7 +469,7 @@ until convergence
Field& evec_k = evec[k];
_PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
_PolyOp(evec_k,w); std::cout<<GridLogDebug << "PolyOp" <<std::endl;
if(k>0) w -= lme[k-1] * evec[k-1];
@@ -479,18 +484,18 @@ until convergence
lme[k] = beta;
if ( (k>0) && ( (k % orth_period) == 0 )) {
std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl;
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl;
}
if(k < Nm-1) evec[k+1] = w;
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl;
}
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,

View File

@@ -147,13 +147,20 @@ public:
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
//As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
//To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
//out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
//NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
RealD coarse_relax_tol=5.0e3,
int largestEvalIdxForReport=-1)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
_coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
{ };
//evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
@@ -179,6 +186,12 @@ public:
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
RealD tmp_eval;
ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
}
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
@@ -409,7 +422,7 @@ public:
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);

View File

@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations {
template<class Field> class NormalEquations : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
@@ -60,7 +60,33 @@ public:
}
};
template<class Field> class HPDSolver {
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
Field res(in.Grid());
Field tmp(in.Grid());
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
_Guess(in,res);
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
_Matrix.Mdag(res,out); // out = Mdag res
}
};
template<class Field> class HPDSolver : public LinearFunction<Field> {
private:
LinearOperatorBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
@@ -78,13 +104,13 @@ public:
void operator() (const Field &in, Field &out){
_Guess(in,out);
_HermitianSolver(_Matrix,in,out); // Mdag M out = Mdag in
_HermitianSolver(_Matrix,in,out); //M out = in
}
};
template<class Field> class MdagMSolver {
template<class Field> class MdagMSolver : public LinearFunction<Field> {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;

View File

@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
RealD evalMaxApprox = 0.0;
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_EST_ = 50;
const int _MAX_ITER_EST_ = 200;
for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -30,18 +30,17 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n);
RealD na = vnum/vden;
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
// evalMaxApprox = na;
// return evalMaxApprox;
// }
evalMaxApprox = na;
src_n = tmp;
}
assert(0);
return 0;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
};
}

View File

@@ -0,0 +1,76 @@
#pragma once
namespace Grid {
class Band
{
RealD lo, hi;
public:
Band(RealD _lo,RealD _hi)
{
lo=_lo;
hi=_hi;
}
RealD operator() (RealD x){
if ( x>lo && x<hi ){
return 1.0;
} else {
return 0.0;
}
}
};
class PowerSpectrum
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
std::vector<RealD> ranges;
std::vector<int> order;
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
template<class Field>
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
int N=ranges.size();
RealD hi = ranges[N-1];
RealD lo_band = 0.0;
RealD hi_band;
RealD nn=norm2(src);
RealD ss=0.0;
Field tmp = src;
for(int b=0;b<N;b++){
hi_band = ranges[b];
Band Notch(lo_band,hi_band);
Chebyshev<Field> polynomial;
polynomial.Init(0.0,hi,order[b],Notch);
polynomial.JacksonSmooth();
polynomial(HermOp,src,tmp) ;
RealD p=norm2(tmp);
ss=ss+p;
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
lo_band=hi_band;
}
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
return 0;
};
};
}

View File

@@ -74,7 +74,7 @@ public:
void operator() (const Field &src, Field &psi){
psi=Zero();
// psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;

View File

@@ -499,6 +499,87 @@ namespace Grid {
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, left preconditioned by Mee^inv
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
//
// Solve:
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
//
// Old notation e<->o
//
// Left precon by Moo^-1
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta

View File

@@ -0,0 +1,931 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LANC_H
#define GRID_LANC_H
#include <string.h> //memset
#ifdef USE_LAPACK
#ifdef USE_MKL
#include<mkl_lapack.h>
#else
void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
//#include <lapacke/lapacke.h>
#endif
#endif
//#include <Grid/algorithms/densematrix/DenseMatrix.h>
// eliminate temorary vector in calc()
#define MEM_SAVE
namespace Grid
{
struct Bisection
{
#if 0
static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
std::vector < RealD > &BETA,
std::vector < RealD > &eig)
{
int i, j;
std::vector < RealD > evec1 (row_num + 3);
std::vector < RealD > evec2 (row_num + 3);
RealD eps2;
ALPHA[1] = 0.;
BETHA[1] = 0.;
for (i = 0; i < row_num - 1; i++)
{
ALPHA[i + 1] = A[i * (row_num + 1)].real ();
BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
}
ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
// Do we really need to sort here?
int begin = 1;
int end = row_num;
int swapped = 1;
while (swapped)
{
swapped = 0;
for (i = begin; i < end; i++)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
end--;
for (i = end - 1; i >= begin; i--)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
begin++;
}
for (i = 0; i < row_num; i++)
{
for (j = 0; j < row_num; j++)
{
if (i == j)
H[i * row_num + j] = evec2[i + 1];
else
H[i * row_num + j] = 0.;
}
}
}
#endif
static void bisec (std::vector < RealD > &c,
std::vector < RealD > &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh, std::vector < RealD > &x, RealD & eps2)
{
std::vector < RealD > wu (n + 2);
RealD h, q, x1, xu, x0, xmin, xmax;
int i, a, k;
b[1] = 0.0;
xmin = c[n] - fabs (b[n]);
xmax = c[n] + fabs (b[n]);
for (i = 1; i < n; i++)
{
h = fabs (b[i]) + fabs (b[i + 1]);
if (c[i] + h > xmax)
xmax = c[i] + h;
if (c[i] - h < xmin)
xmin = c[i] - h;
}
xmax *= 2.;
eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
if (eps1 <= 0.0)
eps1 = eps2;
eps2 = 0.5 * eps1 + 7.0 * (eps2);
x0 = xmax;
for (i = m1; i <= m2; i++)
{
x[i] = xmax;
wu[i] = xmin;
}
for (k = m2; k >= m1; k--)
{
xu = xmin;
i = k;
do
{
if (xu < wu[i])
{
xu = wu[i];
i = m1 - 1;
}
i--;
}
while (i >= m1);
if (x0 > x[k])
x0 = x[k];
while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
{
x1 = (xu + x0) / 2;
a = 0;
q = 1.0;
for (i = 1; i <= n; i++)
{
q =
c[i] - x1 -
((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
if (q < 0)
a++;
}
// printf("x1=%0.14e a=%d\n",x1,a);
if (a < k)
{
if (a < m1)
{
xu = x1;
wu[m1] = x1;
}
else
{
xu = x1;
wu[a + 1] = x1;
if (x[a] > x1)
x[a] = x1;
}
}
else
x0 = x1;
}
printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
x[k] = (x0 + xu) / 2;
}
}
};
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template < class Field > class SimpleLanczos
{
const RealD small = 1.0e-16;
public:
int lock;
int get;
int Niter;
int converged;
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
int Np; // Np -- Number of spare vecs in kryloc space
int Nm; // Nm -- total number of vectors
RealD OrthoTime;
RealD eresid;
// SortEigen < Field > _sort;
LinearFunction < Field > &_Linop;
// OperatorFunction < Field > &_poly;
/////////////////////////
// Constructor
/////////////////////////
void init (void)
{
};
// void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector < RealD > >&evecs);
SimpleLanczos (LinearFunction < Field > &Linop, // op
// OperatorFunction < Field > &poly, // polynmial
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _Niter): // Max iterations
_Linop (Linop),
// _poly (poly),
Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
{
Np = Nm - Nk;
assert (Np > 0);
};
/////////////////////////
// Sanity checked this routine (step) against Saad.
/////////////////////////
void RitzMatrix (std::vector < Field > &evec, int k)
{
if (1)
return;
GridBase *grid = evec[0].Grid();
Field w (grid);
std::cout << GridLogMessage << "RitzMatrix " << std::endl;
for (int i = 0; i < k; i++)
{
_Linop(evec[i], w);
// _poly(_Linop,evec[i],w);
std::cout << GridLogMessage << "[" << i << "] ";
for (int j = 0; j < k; j++)
{
ComplexD in = innerProduct (evec[j], w);
if (fabs ((double) i - j) > 1)
{
if (abs (in) > 1.0e-9)
{
std::cout << GridLogMessage << "oops" << std::endl;
abort ();
}
else
std::cout << GridLogMessage << " 0 ";
}
else
{
std::cout << GridLogMessage << " " << in << " ";
}
}
std::cout << GridLogMessage << std::endl;
}
}
void step (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
Field & last, Field & current, Field & next, uint64_t k)
{
if (lmd.size () <= k)
lmd.resize (k + Nm);
if (lme.size () <= k)
lme.resize (k + Nm);
// _poly(_Linop,current,next ); // 3. wk:=Avkβkv_{k1}
_Linop(current, next); // 3. wk:=Avkβkv_{k1}
if (k > 0)
{
next -= lme[k - 1] * last;
}
// std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
ComplexD zalph = innerProduct (current, next); // 4. αk:=(wk,vk)
RealD alph = real (zalph);
next = next - alph * current; // 5. wk:=wkαkvk
// std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
RealD beta = normalise (next); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
// norm=beta;
int interval = Nm / 100 + 1;
if ((k % interval) == 0)
std::
cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
beta << std::endl;
const RealD tiny = 1.0e-20;
if (beta < tiny)
{
std::cout << GridLogMessage << " beta is tiny " << beta << std::
endl;
}
lmd[k] = alph;
lme[k] = beta;
}
void qr_decomp (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int Nk,
int Nm,
std::vector < RealD > &Qt, RealD Dsh, int kmin, int kmax)
{
int k = kmin - 1;
RealD x;
RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
RealD c = (lmd[k] - Dsh) * Fden;
RealD s = -lme[k] * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
// Givens transformations
for (int k = kmin; k < kmax - 1; ++k)
{
RealD Fden = 1.0 / hypot (x, lme[k - 1]);
RealD c = lme[k - 1] * Fden;
RealD s = -x * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
lme[k - 1] = c * lme[k - 1] - s * x;
if (k != kmax - 2)
{
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
}
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
}
}
#if 0
#ifdef USE_LAPACK
#ifdef USE_MKL
#define LAPACK_INT MKL_INT
#else
#define LAPACK_INT long long
#endif
void diagonalize_lapack (std::vector < RealD > &lmd, std::vector < RealD > &lme, int N1, // all
int N2, // get
GridBase * grid)
{
const int size = Nm;
LAPACK_INT NN = N1;
double evals_tmp[NN];
double DD[NN];
double EE[NN];
for (int i = 0; i < NN; i++)
for (int j = i - 1; j <= i + 1; j++)
if (j < NN && j >= 0)
{
if (i == j)
DD[i] = lmd[i];
if (i == j)
evals_tmp[i] = lmd[i];
if (j == (i - 1))
EE[j] = lme[j];
}
LAPACK_INT evals_found;
LAPACK_INT lwork =
((18 * NN) >
(1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
LAPACK_INT liwork = 3 + NN * 10;
LAPACK_INT iwork[liwork];
double work[lwork];
LAPACK_INT isuppz[2 * NN];
char jobz = 'N'; // calculate evals only
char range = 'I'; // calculate il-th to iu-th evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
LAPACK_INT info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN / total) + 1;
double vl = 0.0, vu = 0.0;
LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
if (iu > NN)
iu = NN;
double tol = 0.0;
if (1)
{
memset (evals_tmp, 0, sizeof (double) * NN);
if (il <= NN)
{
printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
#ifdef USE_MKL
dstegr (&jobz, &range, &NN,
#else
LAPACK_dstegr (&jobz, &range, &NN,
#endif
(double *) DD, (double *) EE, &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double *) NULL, &NN,
isuppz, work, &lwork, iwork, &liwork, &info);
for (int i = iu - 1; i >= il - 1; i--)
{
printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
evals_tmp[i] = evals_tmp[i - (il - 1)];
if (il > 1)
evals_tmp[i - (il - 1)] = 0.;
}
}
{
grid->GlobalSumVector (evals_tmp, NN);
}
}
// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
}
#undef LAPACK_INT
#endif
void diagonalize (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int N2, int N1, GridBase * grid)
{
#ifdef USE_LAPACK
const int check_lapack = 0; // just use lapack if 0, check against lapack if 1
if (!check_lapack)
return diagonalize_lapack (lmd, lme, N2, N1, grid);
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
}
#endif
static RealD normalise (Field & v)
{
RealD nn = norm2 (v);
nn = sqrt (nn);
v = v * (1.0 / nn);
return nn;
}
void orthogonalize (Field & w, std::vector < Field > &evec, int k)
{
double t0 = -usecond () / 1e6;
typedef typename Field::scalar_type MyComplex;
MyComplex ip;
if (0)
{
for (int j = 0; j < k; ++j)
{
normalise (evec[j]);
for (int i = 0; i < j; i++)
{
ip = innerProduct (evec[i], evec[j]); // are the evecs normalised? ; this assumes so.
evec[j] = evec[j] - ip * evec[i];
}
}
}
for (int j = 0; j < k; ++j)
{
ip = innerProduct (evec[j], w); // are the evecs normalised? ; this assumes so.
w = w - ip * evec[j];
}
normalise (w);
t0 += usecond () / 1e6;
OrthoTime += t0;
}
void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
{
for (int i = 0; i < Qt.size (); ++i)
Qt[i] = 0.0;
for (int k = 0; k < Nm; ++k)
Qt[k + k * Nm] = 1.0;
}
void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
{
GridBase *grid = src.Grid();
// assert(grid == src._grid);
std::
cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
endl;
std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
std::cout << GridLogMessage << " -- size of eval = " << eval.
size () << std::endl;
// assert(c.size() && Nm == eval.size());
std::vector < RealD > lme (Nm);
std::vector < RealD > lmd (Nm);
Field current (grid);
Field last (grid);
Field next (grid);
Nconv = 0;
RealD beta_k;
// Set initial vector
// (uniform vector) Why not src??
// evec[0] = 1.0;
current = src;
std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
endl;
normalise (current);
std::
cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
std::endl;
// Initial Nk steps
OrthoTime = 0.;
double t0 = usecond () / 1e6;
RealD norm; // sqrt norm of last vector
uint64_t iter = 0;
bool initted = false;
std::vector < RealD > low (Nstop * 10);
std::vector < RealD > high (Nstop * 10);
RealD cont = 0.;
while (1) {
cont = 0.;
std::vector < RealD > lme2 (Nm);
std::vector < RealD > lmd2 (Nm);
for (uint64_t k = 0; k < Nm; ++k, iter++) {
step (lmd, lme, last, current, next, iter);
last = current;
current = next;
}
double t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
std::
cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
OrthoTime << "seconds" << std::endl;
// getting eigenvalues
lmd2.resize (iter + 2);
lme2.resize (iter + 2);
for (uint64_t k = 0; k < iter; ++k) {
lmd2[k + 1] = lmd[k];
lme2[k + 2] = lme[k];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: copy: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
{
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (Nstop / total) + 1;
int iu = (iter + 1) - (interval * node + 1);
int il = (iter + 1) - (interval * (node + 1));
std::vector < RealD > eval2 (iter + 3);
RealD eps2;
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
// diagonalize(eval2,lme2,iter,Nk,grid);
RealD diff = 0.;
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
fabs (high[iu-i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
high[iu-i], diff);
high[iu-i] = eval2[i];
}
il = (interval * node + 1);
iu = (interval * (node + 1));
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
fabs (low[i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
low[i], diff);
low[i] = eval2[i];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
}
for (uint64_t k = 0; k < Nk; ++k) {
// eval[k] = eval2[k];
}
if (initted)
{
grid->GlobalSumVector (&cont, 1);
if (cont < 1.) return;
}
initted = true;
}
}
#if 0
/**
There is some matrix Q such that for any vector y
Q.e_1 = y and Q is unitary.
**/
template < class T >
static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
{
int N = y.size (); //Matrix Size
Fill (Q, 0.0);
T tau;
for (int i = 0; i < N; i++)
{
Q[i][0] = y[i];
}
T sig = conj (y[0]) * y[0];
T tau0 = fabs (sqrt (sig));
for (int j = 1; j < N; j++)
{
sig += conj (y[j]) * y[j];
tau = abs (sqrt (sig));
if (abs (tau0) > 0.0)
{
T gam = conj ((y[j] / tau) / tau0);
for (int k = 0; k <= j - 1; k++)
{
Q[k][j] = -gam * y[k];
}
Q[j][j] = tau0 / tau;
}
else
{
Q[j - 1][j] = 1.0;
}
tau0 = tau;
}
return tau;
}
/**
There is some matrix Q such that for any vector y
Q.e_k = y and Q is unitary.
**/
template < class T >
static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
{
T tau = orthQ (Q, y);
SL (Q);
return tau;
}
/**
Wind up with a matrix with the first con rows untouched
say con = 2
Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
and the matrix is upper hessenberg
and with f and Q appropriately modidied with Q is the arnoldi factorization
**/
template < class T > static void Lock (DenseMatrix < T > &H, ///Hess mtx
DenseMatrix < T > &Q, ///Lock Transform
T val, ///value to be locked
int con, ///number already locked
RealD small, int dfg, bool herm)
{
//ForceTridiagonal(H);
int M = H.dim;
DenseVector < T > vec;
Resize (vec, M - con);
DenseMatrix < T > AH;
Resize (AH, M - con, M - con);
AH = GetSubMtx (H, con, M, con, M);
DenseMatrix < T > QQ;
Resize (QQ, M - con, M - con);
Unity (Q);
Unity (QQ);
DenseVector < T > evals;
Resize (evals, M - con);
DenseMatrix < T > evecs;
Resize (evecs, M - con, M - con);
Wilkinson < T > (AH, evals, evecs, small);
int k = 0;
RealD cold = abs (val - evals[k]);
for (int i = 1; i < M - con; i++)
{
RealD cnew = abs (val - evals[i]);
if (cnew < cold)
{
k = i;
cold = cnew;
}
}
vec = evecs[k];
ComplexD tau;
orthQ (QQ, vec);
//orthQM(QQ,AH,vec);
AH = Hermitian (QQ) * AH;
AH = AH * QQ;
for (int i = con; i < M; i++)
{
for (int j = con; j < M; j++)
{
Q[i][j] = QQ[i - con][j - con];
H[i][j] = AH[i - con][j - con];
}
}
for (int j = M - 1; j > con + 2; j--)
{
DenseMatrix < T > U;
Resize (U, j - 1 - con, j - 1 - con);
DenseVector < T > z;
Resize (z, j - 1 - con);
T nm = norm (z);
for (int k = con + 0; k < j - 1; k++)
{
z[k - con] = conj (H (j, k + 1));
}
normalise (z);
RealD tmp = 0;
for (int i = 0; i < z.size () - 1; i++)
{
tmp = tmp + abs (z[i]);
}
if (tmp < small / ((RealD) z.size () - 1.0))
{
continue;
}
tau = orthU (U, z);
DenseMatrix < T > Hb;
Resize (Hb, j - 1 - con, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += H[a][con + 1 + c] * U[c][b];
} //sum += H(a,con+1+c)*U(c,b);}
Hb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
H[l][k] = Hb[k - 1 - con][l];
}
} //H(Hb[k-1-con][l] , l,k);}}
DenseMatrix < T > Qb;
Resize (Qb, M, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += Q[a][con + 1 + c] * U[c][b];
} //sum += Q(a,con+1+c)*U(c,b);}
Qb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
Q[l][k] = Qb[k - 1 - con][l];
}
} //Q(Qb[k-1-con][l] , l,k);}}
DenseMatrix < T > Hc;
Resize (Hc, M, M);
for (int a = 0; a < j - 1 - con; a++)
{
for (int b = 0; b < M; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += conj (U[c][a]) * H[con + 1 + c][b];
} //sum += conj( U(c,a) )*H(con+1+c,b);}
Hc[b][a] = sum;
}
}
for (int k = 0; k < M; k++)
{
for (int l = con + 1; l < j; l++)
{
H[l][k] = Hc[k][l - 1 - con];
}
} //H(Hc[k][l-1-con] , l,k);}}
}
}
#endif
};
}
#endif

View File

@@ -0,0 +1,608 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/Aggregates.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x)
{
// return std::pow(x,-4);
// return std::pow(x,-3);
return std::pow(x,-5);
}
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
constexpr int Nbasis(void) { return nbasis; };
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspaceRandom(GridParallelRNG &RNG) {
int nn=nbasis;
RealD scale;
FineField noise(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
subspace[b] = noise;
}
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
RealD scale;
ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
subspace[b] = guess;
#endif
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
subspace[b] = noise;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
<<ordermin<<" step "<<orderstep
<<" lo"<<filterlo<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
ComplexD ip;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possible more fine grained control is needed than a linear sweep,
// but huge productivity gain if this is simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
ComplexD ip;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn);
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
// Generate a full sequence of Chebyshevs
for(int n=1;n<nn;n++){
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
Cheb(hermop,subspace[n-1],Mn);
for(int m=0;m<n;m++){
ComplexD c = innerProduct(subspace[m],Mn);
Mn = Mn - c*subspace[m];
}
// normalise
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[n]=Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
// Refine
Chebyshev<FineField> PowerLaw(lo,hi,1000,AggregatePowerLaw);
noise = Mn;
PowerLaw(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
// normalise
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void CreateSubspaceChebyshevPowerLaw(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
int orderfilter
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" [0,"<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
Chebyshev<FineField> Cheb(0.0,hi,orderfilter,AggregatePowerLaw);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void CreateSubspaceChebyshevNew(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
double hi
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
//#opt2(x) = acheb(x,3,90,300)* acheb(x,1,90,50) * acheb(x,0.5,90,200) * acheb(x,0.05,90,400) * acheb(x,0.01,90,1500)
/*266
Chebyshev<FineField> Cheb1(3.0,hi,300);
Chebyshev<FineField> Cheb2(1.0,hi,50);
Chebyshev<FineField> Cheb3(0.5,hi,300);
Chebyshev<FineField> Cheb4(0.05,hi,500);
Chebyshev<FineField> Cheb5(0.01,hi,2000);
*/
/* 242 */
/*
Chebyshev<FineField> Cheb3(0.1,hi,300);
Chebyshev<FineField> Cheb2(0.02,hi,1000);
Chebyshev<FineField> Cheb1(0.003,hi,2000);
8?
*/
/* How many??
*/
Chebyshev<FineField> Cheb2(0.001,hi,2500); // 169 iters on HDCG after refine
Chebyshev<FineField> Cheb1(0.02,hi,600);
// Chebyshev<FineField> Cheb2(0.001,hi,1500);
// Chebyshev<FineField> Cheb1(0.02,hi,600);
Cheb1(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb1 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
Cheb2(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb2 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb3(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb3 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb4(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb4 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb5(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb5 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
subspace[b] = noise;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<< " norm " << norm2(noise)<<std::endl;
}
}
virtual void CreateSubspaceMultishift(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
double Lo,double tol,int maxit)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Multishift subspace : Lo "<<Lo<<std::endl;
// Filter
// [ 1/6(x+Lo) - 1/2(x+2Lo) + 1/2(x+3Lo) -1/6(x+4Lo) = Lo^3 /[ (x+1Lo)(x+2Lo)(x+3Lo)(x+4Lo) ]
//
// 1/(x+Lo) - 1/(x+2 Lo)
double epsilon = Lo/3;
std::vector<RealD> alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0});
std::vector<RealD> shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon});
std::vector<RealD> tols({tol,tol,tol,tol});
std::cout << "sizes "<<alpha.size()<<" "<<shifts.size()<<" "<<tols.size()<<std::endl;
MultiShiftFunction msf(4,0.0,95.0);
std::cout << "msf constructed "<<std::endl;
msf.poles=shifts;
msf.residues=alpha;
msf.tolerances=tols;
msf.norm=0.0;
msf.order=alpha.size();
ConjugateGradientMultiShift<FineField> MSCG(maxit,msf);
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
MSCG(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void RefineSubspace(LinearOperatorBase<FineField> &hermop,
double Lo,double tol,int maxit)
{
FineField tmp(FineGrid);
for(int b =0;b<nbasis;b++)
{
ConjugateGradient<FineField> CGsloppy(tol,maxit,false);
ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,Lo);
tmp=Zero();
CGsloppy(hermop,subspace[b],tmp);
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b]=tmp;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void RefineSubspaceHDCG(LinearOperatorBase<FineField> &hermop,
TwoLevelADEF2mrhs<FineField,CoarseVector> & theHDCG,
int nrhs)
{
std::vector<FineField> src_mrhs(nrhs,FineGrid);
std::vector<FineField> res_mrhs(nrhs,FineGrid);
FineField tmp(FineGrid);
for(int b =0;b<nbasis;b+=nrhs)
{
tmp = subspace[b];
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b] =tmp;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "before filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
for(int r=0;r<MIN(nbasis-b,nrhs);r++){
src_mrhs[r] = subspace[b+r];
}
for(int r=0;r<nrhs;r++){
res_mrhs[r] = Zero();
}
theHDCG(src_mrhs,res_mrhs);
for(int r=0;r<MIN(nbasis-b,nrhs);r++){
tmp = res_mrhs[r];
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b+r]=tmp;
}
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "after filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
};
NAMESPACE_END(Grid);

View File

@@ -56,243 +56,6 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
blockSum(CoarseInner,fine_inner_msk);
}
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possible more fine grained control is needed than a linear sweep,
// but huge productivity gain if this is simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
};
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
@@ -324,9 +87,9 @@ public:
GridBase* _cbgrid;
int hermitian;
CartesianStencil<siteVector,siteVector,int> Stencil;
CartesianStencil<siteVector,siteVector,int> StencilEven;
CartesianStencil<siteVector,siteVector,int> StencilOdd;
CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd;
std::vector<CoarseMatrix> A;
std::vector<CoarseMatrix> Aeven;
@@ -336,7 +99,7 @@ public:
CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd;
Vector<RealD> dag_factor;
deviceVector<RealD> dag_factor;
///////////////////////
// Interface
@@ -361,9 +124,13 @@ public:
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@@ -398,7 +165,7 @@ public:
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
};
void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -427,9 +194,14 @@ public:
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@@ -438,10 +210,10 @@ public:
int osites=Grid()->oSites();
Vector<int> points(geom.npoint, 0);
for(int p=0; p<geom.npoint; p++)
points[p] = geom.points_dagger[p];
deviceVector<int> points(geom.npoint);
for(int p=0; p<geom.npoint; p++) {
acceleratorPut(points[p],geom.points_dagger[p]);
}
auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0];
@@ -473,7 +245,7 @@ public:
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
void MdirComms(const CoarseVector &in)
@@ -488,8 +260,14 @@ public:
out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite);
@@ -522,7 +300,7 @@ public:
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{
@@ -631,7 +409,7 @@ public:
assert(Aself != nullptr);
}
void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
const CoarseVector &in, CoarseVector &out, int dag) {
int point = geom.npoint-1;
autoView( out_v, out, AcceleratorWrite);
@@ -694,7 +472,7 @@ public:
}
}
void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a,
const CoarseVector &in, CoarseVector &out, int dag) {
SimpleCompressor<siteVector> compressor;
@@ -706,14 +484,20 @@ public:
// determine in what order we need the points
int npoint = geom.npoint-1;
Vector<int> points(npoint, 0);
for(int p=0; p<npoint; p++)
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
deviceVector<int> points(npoint);
for(int p=0; p<npoint; p++) {
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
auto points_p = &points[0];
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@@ -776,7 +560,7 @@ public:
});
}
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@@ -784,9 +568,9 @@ public:
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,_cbgrid),
Aodd(geom.npoint,_cbgrid),
@@ -804,9 +588,9 @@ public:
_cbgrid(&CoarseRBGrid),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,&CoarseRBGrid),
Aodd(geom.npoint,&CoarseRBGrid),
@@ -827,11 +611,13 @@ public:
}
// GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, {
int j = i/nbasis;
int k = i%nbasis;
dag_factor[i] = dag_factor_eigen(j, k);
h_dag_factor[i] = dag_factor_eigen(j, k);
});
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
}
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,

View File

@@ -0,0 +1,629 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef Lattice<CComplex > FineComplexField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
int hermitian;
GridBase * _FineGrid;
GridCartesian * _CoarseGrid;
NonLocalStencilGeometry &geom;
PaddedCell Cell;
GeneralLocalStencil Stencil;
std::vector<CoarseMatrix> _A;
std::vector<CoarseMatrix> _Adag;
std::vector<CoarseVector> MultTemporaries;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know
GridBase * FineGrid(void) { return _FineGrid; }; // this is all the linalg routines need to know
GridCartesian * CoarseGrid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know
/* void ShiftMatrix(RealD shift)
{
int Nd=_FineGrid->Nd();
Coordinate zero_shift(Nd,0);
for(int p=0;p<geom.npoint;p++){
if ( zero_shift==geom.shifts[p] ) {
_A[p] = _A[p]+shift;
// _Adag[p] = _Adag[p]+shift;
}
}
}
void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
{
int nfound=0;
std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
for(int p=0;p<geom.npoint;p++){
for(int pp=0;pp<CopyMe.geom.npoint;pp++){
// Search for the same relative shift
// Avoids brutal handling of Grid pointers
if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
_A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
// _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
nfound++;
}
}
}
assert(nfound==geom.npoint);
ExchangeCoarseLinks();
}
*/
GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
: geom(_geom),
_FineGrid(FineGrid),
_CoarseGrid(CoarseGrid),
hermitian(1),
Cell(_geom.Depth(),_CoarseGrid),
Stencil(Cell.grids.back(),geom.shifts)
{
{
int npoint = _geom.npoint;
}
_A.resize(geom.npoint,CoarseGrid);
// _Adag.resize(geom.npoint,CoarseGrid);
}
void M (const CoarseVector &in, CoarseVector &out)
{
Mult(_A,in,out);
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
assert(hermitian);
Mult(_A,in,out);
// if ( hermitian ) M(in,out);
// else Mult(_Adag,in,out);
}
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
RealD tmult2=0;
ttot=-usecond();
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
CoarseVector tin=in;
texch-=usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin);
texch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t osites=pin.Grid()->oSites();
RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+ 2.0*osites*sizeof(siteVector)*npoint;
{
tviews-=usecond();
autoView( in_v , pin, AcceleratorRead);
autoView( out_v , pout, AcceleratorWriteDiscard);
autoView( Stencil_v , Stencil, AcceleratorRead);
tviews+=usecond();
// Static and prereserve to keep UVM region live and not resized across multiple calls
ttemps-=usecond();
MultTemporaries.resize(npoint,pin.Grid());
ttemps+=usecond();
std::vector<Aview> AcceleratorViewContainer_h;
std::vector<Vview> AcceleratorVecViewContainer_h;
tviews-=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h.push_back( A[p].View(AcceleratorRead));
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
}
tviews+=usecond();
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
auto Aview_p = &AcceleratorViewContainer[0];
auto Vview_p = &AcceleratorVecViewContainer[0];
tcopy-=usecond();
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
tcopy+=usecond();
tmult-=usecond();
accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int32_t ss = spb/(nbasis*npoint);
int32_t bp = spb%(nbasis*npoint);
int32_t point= bp/nbasis;
int32_t b = bp%nbasis;
auto SE = Stencil_v.GetEntry(point,ss);
auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
for(int bb=1;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
}
coalescedWrite(Vview_p[point][ss](b),res);
});
tmult2-=usecond();
accelerator_for(sb, osites*nbasis, Nsimd, {
int ss = sb/nbasis;
int b = sb%nbasis;
auto res = coalescedRead(Vview_p[0][ss](b));
for(int point=1;point<npoint;point++){
res = res + coalescedRead(Vview_p[point][ss](b));
}
coalescedWrite(out_v[ss](b),res);
});
tmult2+=usecond();
tmult+=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h[p].ViewClose();
AcceleratorVecViewContainer_h[p].ViewClose();
}
}
text-=usecond();
out = Cell.Extract(pout);
text+=usecond();
ttot+=usecond();
std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogPerformance<<" of which mult2 "<<tmult2<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
// std::cout << GridLogPerformance<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
void PopulateAdag(void)
{
for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
Coordinate bcoor;
CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
for(int p=0;p<geom.npoint;p++){
Coordinate scoor = bcoor;
for(int mu=0;mu<bcoor.size();mu++){
int L = CoarseGrid()->GlobalDimensions()[mu];
scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
}
// Flip to poke/peekLocalSite and not too bad
auto link = peekSite(_A[p],scoor);
int pp = geom.Reverse(p);
pokeSite(adj(link),_Adag[pp],bcoor);
}
}
}
/////////////////////////////////////////////////////////////
//
// A) Only reduced flops option is to use a padded cell of depth 4
// and apply MpcDagMpc in the padded cell.
//
// Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
// With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
// Cost is 81x more, same as stencil size.
//
// But: can eliminate comms and do as local dirichlet.
//
// Local exchange gauge field once.
// Apply to all vectors, local only computation.
// Must exchange ghost subcells in reverse process of PaddedCell to take inner products
//
// B) Can reduce cost: pad by 1, apply Deo (4^4+6^4+8^4+8^4 )/ (4x 4^4)
// pad by 2, apply Doe
// pad by 3, apply Deo
// then break out 8x directions; cost is ~10x MpcDagMpc per vector
//
// => almost factor of 10 in setup cost, excluding data rearrangement
//
// Intermediates -- ignore the corner terms, leave approximate and force Hermitian
// Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
/////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////
// BFM HDCG style approach: Solve a system of equations to get Aij
//////////////////////////////////////////////////////////
/*
* Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
*
* conj(phases[block]) proj[k][ block*Nvec+j ] = \sum_ball e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} >
* = \sum_ball e^{iqk.delta} A_ji
*
* Must invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*/
#if 0
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
CoarseVector coarseInner(CoarseGrid());
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
tphase-=usecond();
CoarseComplexField coor(CoarseGrid());
CoarseComplexField pha(CoarseGrid()); pha=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha =exp(pha*ci);
phaV=Zero();
blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
tphase+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
}
// Need to write something to populate Adag from A
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid());
CoarseVector coarseInner(CoarseGrid());
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
tphase=-usecond();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid());
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
// std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl;
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
}
for(int p=0;p<geom.npoint;p++){
std::cout << " _A["<<p<<"] "<<norm2(_A[p])<<std::endl;
}
// Need to write something to populate Adag from A
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#endif
void ExchangeCoarseLinks(void){
for(int p=0;p<geom.npoint;p++){
_A[p] = Cell.ExchangePeriodic(_A[p]);
// _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
}
}
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,729 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef typename CComplex::scalar_object SComplex;
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef MultiGeneralCoarsenedMatrix<Fobj,CComplex,nbasis> MultiGeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef iVector<SComplex,nbasis > calcVector;
typedef iMatrix<SComplex,nbasis > calcMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef Lattice<CComplex > FineComplexField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
GridCartesian * _CoarseGridMulti;
NonLocalStencilGeometry geom;
NonLocalStencilGeometry geom_srhs;
PaddedCell Cell;
GeneralLocalStencil Stencil;
deviceVector<calcVector> BLAS_B;
deviceVector<calcVector> BLAS_C;
std::vector<deviceVector<calcMatrix> > BLAS_A;
std::vector<deviceVector<ComplexD *> > BLAS_AP;
std::vector<deviceVector<ComplexD *> > BLAS_BP;
deviceVector<ComplexD *> BLAS_CP;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
GridCartesian * CoarseGrid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
// Can be used to do I/O on the operator matrices externally
void SetMatrix (int p,CoarseMatrix & A)
{
assert(A.size()==geom_srhs.npoint);
GridtoBLAS(A[p],BLAS_A[p]);
}
void GetMatrix (int p,CoarseMatrix & A)
{
assert(A.size()==geom_srhs.npoint);
BLAStoGrid(A[p],BLAS_A[p]);
}
void CopyMatrix (GeneralCoarseOp &_Op)
{
for(int p=0;p<geom.npoint;p++){
auto Aup = _Op.Cell.Extract(_Op._A[p]);
//Unpadded
GridtoBLAS(Aup,BLAS_A[p]);
}
}
/*
void CheckMatrix (GeneralCoarseOp &_Op)
{
std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
for(int p=0;p<geom.npoint;p++){
//Unpadded
auto Aup = _Op.Cell.Extract(_Op._A[p]);
auto Ack = Aup;
BLAStoGrid(Ack,BLAS_A[p]);
std::cout << p<<" Ack "<<norm2(Ack)<<std::endl;
std::cout << p<<" Aup "<<norm2(Aup)<<std::endl;
}
std::cout <<"************* "<<std::endl;
}
*/
MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridCartesian *CoarseGridMulti) :
_CoarseGridMulti(CoarseGridMulti),
geom_srhs(_geom),
geom(_CoarseGridMulti,_geom.hops,_geom.skip+1),
Cell(geom.Depth(),_CoarseGridMulti),
Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
{
int32_t padded_sites = Cell.grids.back()->lSites();
int32_t unpadded_sites = CoarseGridMulti->lSites();
int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS
int32_t orhs = nrhs/CComplex::Nsimd();
padded_sites = padded_sites/nrhs;
unpadded_sites = unpadded_sites/nrhs;
/////////////////////////////////////////////////
// Device data vector storage
/////////////////////////////////////////////////
BLAS_A.resize(geom.npoint);
for(int p=0;p<geom.npoint;p++){
BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
}
BLAS_B.resize(nrhs *padded_sites); // includes ghost zone
BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
BLAS_AP.resize(geom.npoint);
BLAS_BP.resize(geom.npoint);
for(int p=0;p<geom.npoint;p++){
BLAS_AP[p].resize(unpadded_sites);
BLAS_BP[p].resize(unpadded_sites);
}
BLAS_CP.resize(unpadded_sites);
/////////////////////////////////////////////////
// Pointers to data
/////////////////////////////////////////////////
// Site identity mapping for A
for(int p=0;p<geom.npoint;p++){
for(int ss=0;ss<unpadded_sites;ss++){
ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
acceleratorPut(BLAS_AP[p][ss],ptr);
}
}
// Site identity mapping for C
for(int ss=0;ss<unpadded_sites;ss++){
ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
acceleratorPut(BLAS_CP[ss],ptr);
}
// Neighbour table is more complicated
int32_t j=0; // Interior point counter (unpadded)
for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
int ghost_zone=0;
for(int32_t point = 0 ; point < geom.npoint; point++){
int i=s*orhs*geom.npoint+point;
if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
}
}
if( ghost_zone==0) {
for(int32_t point = 0 ; point < geom.npoint; point++){
int i=s*orhs*geom.npoint+point;
int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
assert(nbr<BLAS_B.size());
ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
}
j++;
}
}
assert(j==unpadded_sites);
}
template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *Fg = from.Grid();
assert(!Fg->_isCheckerBoarded);
int nd = Fg->_ndimension;
to.resize(Fg->lSites());
Coordinate LocalLatt = Fg->LocalDimensions();
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
////////////////////////////////////////////////////////////////////////////////////////////////
// do the index calc on the GPU
////////////////////////////////////////////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
autoView(from_v,from,AcceleratorRead);
auto to_v = &to[0];
const int words=sizeof(vobj)/sizeof(vector_type);
accelerator_for(idx,nsite,1,{
Coordinate from_coor, base;
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
for(int i=0;i<nd;i++){
from_coor[i] = base[i];
}
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
scalar_type* to = (scalar_type *)&to_v[idx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
to[w] = stmp;
}
});
}
template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *Tg = grid.Grid();
assert(!Tg->_isCheckerBoarded);
int nd = Tg->_ndimension;
assert(in.size()==Tg->lSites());
Coordinate LocalLatt = Tg->LocalDimensions();
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
////////////////////////////////////////////////////////////////////////////////////////////////
// do the index calc on the GPU
////////////////////////////////////////////////////////////////////////////////////////////////
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
autoView(to_v,grid,AcceleratorWrite);
auto from_v = &in[0];
const int words=sizeof(vobj)/sizeof(vector_type);
accelerator_for(idx,nsite,1,{
Coordinate to_coor, base;
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
for(int i=0;i<nd;i++){
to_coor[i] = base[i];
}
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type* from = (scalar_type *)&from_v[idx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp=from[w];
putlane(to[w], stmp, to_lane);
}
});
}
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace,
GridBase *CoarseGrid)
{
#if 0
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
phaV = phaF[p]*Subspace.subspace[i];
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
linop.Op(phaV,MphaV);
// Fixme, could use batched block projector here
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
for(int k=0;k<npoint;k++){
FT = Zero();
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
/*
Grid : Message : 11698.730546 s : CoarsenOperator eigen 1334 us
Grid : Message : 11698.730563 s : CoarsenOperator phase 34729 us
Grid : Message : 11698.730565 s : CoarsenOperator phaseBZ 2423814 us
Grid : Message : 11698.730566 s : CoarsenOperator mat 127890998 us
Grid : Message : 11698.730567 s : CoarsenOperator proj 515840840 us
Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us
Takes 600s to compute matrix elements, DOMINATED by the block project.
Easy to speed up with the batched block project.
Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
// Block project below taks to 240s
Grid : Message : 328.193418 s : CoarsenOperator phase 38338 us
Grid : Message : 328.193434 s : CoarsenOperator phaseBZ 1711226 us
Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
//Grid : Message : 328.193438 s : CoarsenOperator proj 1181154 us <-- this is mistimed
//Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us <-- Cut this ~10x if lucky by loop fusion
*/
#else
RealD tproj=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
MultiRHSBlockProject<Lattice<Fobj> > Projector;
Projector.Allocate(nbasis,grid,CoarseGrid);
Projector.ImportBasis(Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
tphase=-usecond();
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
// Count use small chunks than npoint == 81 and save memory
int batch = 9;
std::vector<FineField> _MphaV(batch,grid);
std::vector<CoarseVector> TmpProj(batch,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
// std::cout << GridLogMessage << " phasing the fine vector "<<std::endl;
// Fixme : do this in batches
for(int p=0;p<npoint;p+=batch){ // Loop over momenta in npoint
for(int b=0;b<MIN(batch,npoint-p);b++){
tphaseBZ-=usecond();
phaV = phaF[p+b]*Subspace.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
// Memory footprint was an issue
tmat-=usecond();
linop.Op(phaV,MphaV);
_MphaV[b] = MphaV;
tmat+=usecond();
}
// std::cout << GridLogMessage << " Calling block project "<<std::endl;
tproj-=usecond();
Projector.blockProject(_MphaV,TmpProj);
tproj+=usecond();
// std::cout << GridLogMessage << " conj phasing the coarse vectors "<<std::endl;
for(int b=0;b<MIN(batch,npoint-p);b++){
ComputeProj[p+b] = conjugate(pha[p+b])*TmpProj[b];
}
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
// std::cout << GridLogMessage << " Starting FT inv "<<std::endl;
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT = Zero();
// 81 kernel calls as many ComputeProj vectors
// Could fuse with a vector of views, but ugly
// Could unroll the expression and run fewer kernels -- much more attractive
// Could also do non blocking.
#if 0
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#else
const int radix = 9;
int ll;
for(ll=0;ll+radix-1<npoint;ll+=radix){
// When ll = npoint-radix, ll+radix-1 = npoint-1, and we do it all.
FT = FT
+ invMkl(ll+0,k)*ComputeProj[ll+0]
+ invMkl(ll+1,k)*ComputeProj[ll+1]
+ invMkl(ll+2,k)*ComputeProj[ll+2]
+ invMkl(ll+3,k)*ComputeProj[ll+3]
+ invMkl(ll+4,k)*ComputeProj[ll+4]
+ invMkl(ll+5,k)*ComputeProj[ll+5]
+ invMkl(ll+6,k)*ComputeProj[ll+6]
+ invMkl(ll+7,k)*ComputeProj[ll+7]
+ invMkl(ll+8,k)*ComputeProj[ll+8];
}
for(int l=ll;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#endif
// 1 kernel call -- must be cheaper
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
// std::cout << GridLogMessage << " Calling GridtoBLAS "<<std::endl;
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
#endif
}
void Mdag(const CoarseVector &in, CoarseVector &out)
{
this->M(in,out);
}
void M (const CoarseVector &in, CoarseVector &out)
{
// std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
RealD t_tot;
RealD t_exch;
RealD t_GtoB;
RealD t_BtoG;
RealD t_mult;
t_tot=-usecond();
CoarseVector tin=in;
t_exch=-usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
t_exch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef calcMatrix* Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
assert(nrhs>=1);
RealD flops,bytes;
int64_t osites=in.Grid()->oSites(); // unpadded
int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs;
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
+ 2.0*osites*sizeof(siteVector)*npoint;
t_GtoB=-usecond();
GridtoBLAS(pin,BLAS_B);
t_GtoB+=usecond();
GridBLAS BLAS;
t_mult=-usecond();
for(int p=0;p<geom.npoint;p++){
RealD c = 1.0;
if (p==0) c = 0.0;
ComplexD beta(c);
BLAS.gemmBatched(nbasis,nrhs,nbasis,
ComplexD(1.0),
BLAS_AP[p],
BLAS_BP[p],
ComplexD(c),
BLAS_CP);
}
BLAS.synchronise();
t_mult+=usecond();
t_BtoG=-usecond();
BLAStoGrid(out,BLAS_C);
t_BtoG+=usecond();
t_tot+=usecond();
/*
std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult tot "<<t_tot<<" us"<<std::endl;
*/
// std::cout << GridLogMessage<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,238 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////
// Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
/////////////////////////////////////////////////////////////////
// Less local equivalent of Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class NonLocalStencilGeometry {
public:
// int depth;
int skip;
int hops;
int npoint;
std::vector<Coordinate> shifts;
Coordinate stencil_size;
Coordinate stencil_lo;
Coordinate stencil_hi;
GridCartesian *grid;
GridCartesian *Grid() {return grid;};
int Depth(void){return 1;}; // Ghost zone depth
int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
int DimSkip(void){return skip;};
virtual ~NonLocalStencilGeometry() {};
int Reverse(int point)
{
int Nd = Grid()->Nd();
Coordinate shft = shifts[point];
Coordinate rev(Nd);
for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
for(int p=0;p<npoint;p++){
if(rev==shifts[p]){
return p;
}
}
assert(0);
return -1;
}
void BuildShifts(void)
{
this->shifts.resize(0);
int Nd = this->grid->Nd();
int dd = this->DimSkip();
for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
Coordinate sft(Nd,0);
sft[dd+0] = s0;
sft[dd+1] = s1;
sft[dd+2] = s2;
sft[dd+3] = s3;
int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
if(nhops<=this->hops) this->shifts.push_back(sft);
}}}}
this->npoint = this->shifts.size();
std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
}
NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops,int _skip) : grid(_coarse_grid), hops(_hops), skip(_skip)
{
Coordinate latt = grid->GlobalDimensions();
stencil_size.resize(grid->Nd());
stencil_lo.resize(grid->Nd());
stencil_hi.resize(grid->Nd());
for(int d=0;d<grid->Nd();d++){
if ( latt[d] == 1 ) {
stencil_lo[d] = 0;
stencil_hi[d] = 0;
stencil_size[d]= 1;
} else if ( latt[d] == 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 0;
stencil_size[d]= 2;
} else if ( latt[d] > 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 1;
stencil_size[d]= 3;
}
}
this->BuildShifts();
};
};
// Need to worry about red-black now
class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
public:
virtual int DerivedDimSkip(void) { return 0;};
NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,0) { };
virtual ~NonLocalStencilGeometry4D() {};
};
class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
public:
virtual int DerivedDimSkip(void) { return 1; };
NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,1) { };
virtual ~NonLocalStencilGeometry5D() {};
};
/*
* Bunch of different options classes
*/
class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,4)
{
};
};
class NextToNextToNextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,4)
{
};
};
class NextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,2)
{
};
};
class NextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,2)
{
};
};
class NearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,1)
{
};
};
class NearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,1)
{
};
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid/algorithms/multigrid/MultiGrid.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/multigrid/Aggregates.h>
#include <Grid/algorithms/multigrid/Geometry.h>
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>

View File

@@ -54,6 +54,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -66,7 +69,7 @@ public:
}
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
@@ -100,6 +103,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -145,6 +151,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -165,18 +174,48 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
#ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
/*
template<class T> class vecView
{
protected:
T * data;
uint64_t size;
ViewMode mode;
void * cpu_ptr;
public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(Vector<T> &refer_to_me,ViewMode _mode)
{
cpu_ptr = &refer_to_me[0];
size = refer_to_me.size();
mode = _mode;
data =(T *) MemoryManager::ViewOpen(cpu_ptr,
size*sizeof(T),
mode,
AdviseDefault);
}
void ViewClose(void)
{ // Inform the manager
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
{
vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed
}
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid);

View File

@@ -4,15 +4,56 @@ NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#define CpuHuge (1)
#define CpuSmall (2)
#define Acc (3)
#define AccHuge (4)
#define AccSmall (5)
#define Shared (6)
#define SharedHuge (7)
#define SharedSmall (8)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
#if defined(__has_feature)
#if __has_feature(leak_sanitizer)
#define ASAN_LEAK_CHECK
#endif
#endif
#ifdef ASAN_LEAK_CHECK
#include <sanitizer/asan_interface.h>
#include <sanitizer/common_interface_defs.h>
#include <sanitizer/lsan_interface.h>
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
#else
#define LEAK_CHECK(A) { }
#endif
void MemoryManager::DisplayMallinfo(void)
{
#ifdef __linux__
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
mi = mallinfo();
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
#endif
LEAK_CHECK();
}
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@@ -32,15 +73,18 @@ void MemoryManager::PrintBytes(void)
#ifdef GRID_CUDA
cuda_mem();
#endif
DisplayMallinfo();
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
@@ -170,6 +214,16 @@ void MemoryManager::Init(void)
}
}
str= getenv("GRID_ALLOC_NCACHE_HUGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuHuge]=Nc;
Ncache[AccHuge]=Nc;
Ncache[SharedHuge]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
@@ -190,7 +244,9 @@ void MemoryManager::InitMessage(void) {
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
#endif
#ifdef GRID_UVM
@@ -222,8 +278,11 @@ void MemoryManager::InitMessage(void) {
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
@@ -232,11 +291,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
if (ncache == 0) return ptr;
void * ret = NULL;
int v = -1;
@@ -271,8 +331,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
@@ -281,7 +344,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif

View File

@@ -35,6 +35,12 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT (2147483648)
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__)
#define AUDIT(a) MemoryManager::Audit(FILE_LINE)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
@@ -65,6 +71,21 @@ enum ViewMode {
CpuWriteDiscard = 0x10 // same for now
};
struct MemoryStatus {
uint64_t DeviceBytes;
uint64_t DeviceLRUBytes;
uint64_t DeviceMaxBytes;
uint64_t HostToDeviceBytes;
uint64_t DeviceToHostBytes;
uint64_t HostToDeviceXfer;
uint64_t DeviceToHostXfer;
uint64_t DeviceEvictions;
uint64_t DeviceDestroy;
uint64_t DeviceAllocCacheBytes;
uint64_t HostAllocCacheBytes;
};
class MemoryManager {
private:
@@ -78,7 +99,7 @@ private:
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static const int NallocType=9;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
@@ -92,8 +113,9 @@ private:
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
static void PrintBytes(void);
public:
static void PrintBytes(void);
static void Audit(std::string s);
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);
@@ -113,6 +135,27 @@ private:
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
static uint64_t DeviceEvictions;
static uint64_t DeviceDestroy;
static uint64_t DeviceCacheBytes();
static uint64_t HostCacheBytes();
static MemoryStatus GetFootprint(void) {
MemoryStatus stat;
stat.DeviceBytes = DeviceBytes;
stat.DeviceLRUBytes = DeviceLRUBytes;
stat.DeviceMaxBytes = DeviceMaxBytes;
stat.HostToDeviceBytes = HostToDeviceBytes;
stat.DeviceToHostBytes = DeviceToHostBytes;
stat.HostToDeviceXfer = HostToDeviceXfer;
stat.DeviceToHostXfer = DeviceToHostXfer;
stat.DeviceEvictions = DeviceEvictions;
stat.DeviceDestroy = DeviceDestroy;
stat.DeviceAllocCacheBytes = DeviceCacheBytes();
stat.HostAllocCacheBytes = HostCacheBytes();
return stat;
};
private:
#ifndef GRID_UVM
@@ -166,10 +209,12 @@ private:
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
static void NotifyDeletion(void * CpuPtr);
public:
static void DisplayMallinfo(void);
static void NotifyDeletion(void * CpuPtr);
static void Print(void);
static void PrintAll(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);

View File

@@ -1,11 +1,15 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...)
//#define mprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
@@ -23,6 +27,8 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
////////////////////////////////////
// Priority ordering for unlocked entries
@@ -104,15 +110,17 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++;
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@@ -121,26 +129,36 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
// Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
// Cannot be acclocked. If allocated must be in LRU pool.
//
// Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
// and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
// but there is a weakness where CpuLock entries are attempted for erase
// Take these OUT LRU queue when CPU locked?
// Cannot take out the table as cpuLock data is important.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return;
if (AccCache.cpuLock!=0) return;
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
// EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
@@ -150,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@@ -165,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
@@ -191,6 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
@@ -202,6 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -212,13 +234,19 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
} else {
return;
}
}
}
@@ -241,11 +269,12 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes);
(uint64_t)bytes,
(uint64_t)AccCache.accLock);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
@@ -280,6 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
@@ -292,28 +322,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else {
assert(0);
}
// If view is opened on device remove from LRU
assert(AccCache.accLock>0);
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache);
}
@@ -334,10 +366,12 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache);
} else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -374,9 +408,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
// CPU doesn't need to free space
// if (!AccCache.AccPtr) {
// EvictVictims(bytes);
// }
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
@@ -430,20 +465,29 @@ void MemoryManager::NotifyDeletion(void *_ptr)
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Memory Manager " << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogMessage << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
acceleratorMem();
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
}
void MemoryManager::PrintAll(void)
{
Print();
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
@@ -453,13 +497,13 @@ void MemoryManager::Print(void)
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
@@ -473,6 +517,63 @@ int MemoryManager::isOpen (void* _CpuPtr)
return 0;
}
}
void MemoryManager::Audit(std::string s)
{
uint64_t CpuBytes=0;
uint64_t AccBytes=0;
uint64_t LruBytes1=0;
uint64_t LruBytes2=0;
uint64_t LruCnt=0;
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it;
assert(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes;
assert(AccCache.LRU_valid==1);
assert(AccCache.LRU_entry==it);
}
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
CpuBytes+=AccCache.bytes;
if( AccCache.AccPtr ) AccBytes+=AccCache.bytes;
if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) {
assert(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t cpuLock " << AccCache.cpuLock
<< "\t accLock " << AccCache.accLock
<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
}
assert( AccCache.cpuLock== 0 ) ;
assert( AccCache.accLock== 0 ) ;
}
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
assert(LruBytes1==LruBytes2);
assert(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
assert(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
assert(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
}
void MemoryManager::PrintState(void* _CpuPtr)
{
@@ -489,8 +590,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;

View File

@@ -12,7 +12,10 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
void MemoryManager::Audit(std::string s){};
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
@@ -21,6 +24,7 @@ void MemoryManager::PrintState(void* CpuPtr)
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::PrintAll(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);

View File

@@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
std::vector<uint64_t> pagedata(npages);
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;

View File

@@ -70,8 +70,8 @@ public:
Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
int64_t _fsites; // _isites*_osites = product(dimensions).
int64_t _gsites;
Coordinate _slice_block;// subslice information
Coordinate _slice_stride;
Coordinate _slice_nblock;
@@ -82,6 +82,7 @@ public:
bool _isCheckerBoarded;
int LocallyPeriodic;
Coordinate _checker_dim_mask;
int _checker_dim;
public:
@@ -183,7 +184,7 @@ public:
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; };
@@ -214,7 +215,7 @@ public:
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
@@ -222,7 +223,7 @@ public:
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {

View File

@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
public:
int dummy;
Coordinate _checker_dim_mask;
// Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
@@ -106,6 +106,7 @@ public:
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_checker_dim = -1;
_lstart.resize(_ndimension);
_lend.resize(_ndimension);

View File

@@ -57,9 +57,10 @@ class GridRedBlackCartesian : public GridBase
{
public:
// Coordinate _checker_dim_mask;
int _checker_dim;
// int _checker_dim;
std::vector<int> _checker_board;
virtual int isCheckerBoarded(void) const { return 1; };
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;

View File

@@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);

View File

@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
@@ -53,10 +55,11 @@ public:
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
Coordinate _processor_coor; // linear processor coordinate
unsigned long _ndimension;
Coordinate _shm_processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processor_coor; // linear processor coordinate
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
@@ -97,6 +100,7 @@ public:
int BossRank(void) ;
int ThisRank(void) ;
const Coordinate & ThisProcessorCoor(void) ;
const Coordinate & ShmGrid(void) { return _shm_processors; } ;
const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ;
@@ -105,6 +109,7 @@ public:
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
static void BarrierWorld(void);
////////////////////////////////////////////////////////////
// Reduction
@@ -125,16 +130,53 @@ public:
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
scalar_type * ptr = (scalar_type *)& o; // Safe alias
GlobalSumVector(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir);
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
@@ -142,17 +184,28 @@ public:
int bytes);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,
int recv_from_rank,int do_recv,
int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,
int bytes,int dir);
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);

View File

@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
@@ -106,7 +107,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
@@ -124,12 +125,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
shm_processors [pad+d]=parent._shm_processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -154,6 +156,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@@ -255,6 +258,25 @@ CartesianCommunicator::~CartesianCommunicator()
}
}
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@@ -285,25 +307,54 @@ void CartesianCommunicator::GlobalMax(double &d)
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
MPI_Request xrq;
MPI_Request rrq;
assert(dest != _processor);
assert(from != _processor);
int tag;
tag= dir+from*32;
int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
@@ -311,9 +362,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
std::vector<MpiCommsRequest_t> reqs(0);
int myrank = _processor;
int ierr;
@@ -329,29 +378,40 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest,
int dest, int dox,
void *recv,
int from,
int from, int dor,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
int dest,int dox,
void *recv,
int from,
int bytes,int dir)
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
@@ -370,48 +430,368 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
double off_node_bytes=0.0;
int tag;
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
off_node_bytes+=xbytes;
} else {
// TODO : make a OMP loop on CPU, call threaded bcopy
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// std::cout << "Copy Synchronised\n"<<std::endl;
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
@@ -438,6 +818,10 @@ int CartesianCommunicator::RankWorld(void){
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BarrierWorld(void){
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,

View File

@@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{
_shm_processors = Coordinate(processors.size(),1);
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
_shm_processors = Coordinate(processors.size(),1);
_processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1);
_processor_coor.resize(_ndimension);
@@ -89,6 +91,17 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
@@ -102,6 +115,7 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
@@ -111,21 +125,32 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,
int recv_from_rank,int dor,
int bytes, int dir)
{
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,
int bytes, int dir)
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 2.0*bytes;
return xbytes+rbytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{

View File

@@ -40,6 +40,9 @@ int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
void * GlobalSharedMemory::HostCommBuf;
#endif
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
@@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
void *SharedMemory::HostBufferMalloc(size_t bytes){
void *ptr = (void *)host_heap_top;
host_heap_top += bytes;
host_heap_bytes+= bytes;
if (host_heap_bytes >= host_heap_size) {
std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(host_heap_bytes<host_heap_size);
}
return ptr;
}
void SharedMemory::HostBufferFreeAll(void) {
host_heap_top =(size_t)HostCommBuf;
host_heap_bytes=0;
}
#endif
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
@@ -91,6 +114,59 @@ void *SharedMemory::ShmBufferSelf(void)
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank];
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
NAMESPACE_END(Grid);

View File

@@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
@@ -75,7 +107,9 @@ public:
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
static void *HostCommBuf;
#endif
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
@@ -93,16 +127,17 @@ public:
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
// Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};
@@ -119,6 +154,13 @@ private:
size_t heap_bytes;
size_t heap_size;
#ifndef ACCELERATOR_AWARE_MPI
size_t host_heap_top; // set in free all
size_t host_heap_bytes;// set in free all
void *HostCommBuf; // set in SetCommunicator
size_t host_heap_size; // set in SetCommunicator
#endif
protected:
Grid_MPI_Comm ShmComm; // for barriers
@@ -150,7 +192,10 @@ public:
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
#ifndef ACCELERATOR_AWARE_MPI
void *HostBufferMalloc(size_t bytes);
void HostBufferFreeAll(void);
#endif
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////

View File

@@ -27,6 +27,8 @@ Author: Christoph Lehner <christoph@lhnr.de>
*************************************************************************************/
/* END LEGAL */
#define Mheader "SharedMemoryMpi: "
#include <Grid/GridCore.h>
#include <pwd.h>
@@ -36,12 +38,127 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCl
#ifdef GRID_SYCL
#ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#endif
#include <sys/socket.h>
#include <sys/un.h>
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
#ifdef SHM_SOCKETS
/*
* Barbaric extra intranode communication route in case we need sockets to pass FDs
* Forced by level_zero not being nicely designed
*/
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
sendmsg(sock, &msg, 0);
};
};
#endif
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
@@ -64,8 +181,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank
@@ -152,7 +269,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
//////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -165,63 +282,11 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
@@ -294,6 +359,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
Coordinate HyperCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM = ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
@@ -341,7 +407,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
}
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
@@ -353,6 +419,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM=ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
@@ -391,7 +459,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
@@ -450,46 +518,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
//if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = ShmCommBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
@@ -513,22 +541,61 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef SHM_SOCKETS
UnixSockets::Open(WorldShmRank);
#endif
for(int r=0;r<WorldShmSize;r++){
MPI_Barrier(WorldShmComm);
#ifndef GRID_MPI3_SHM_NONE
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
@@ -536,10 +603,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; } clone_mem_t;
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
@@ -547,13 +614,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
#ifdef SHM_SOCKETS
for(int rr=0;rr<WorldShmSize;rr++){
if(rr!=r){
UnixSockets::SendFileDescriptor(handle.fd,rr);
}
}
#endif
}
#endif
#ifdef GRID_CUDA
@@ -581,6 +656,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
MPI_Barrier(WorldShmComm);
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
@@ -596,6 +672,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
int myfd;
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
@@ -603,16 +683,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
if (myfd < 0) {
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
perror("pidfd_getfd failed ");
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@@ -647,18 +733,18 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
MPI_Barrier(WorldShmComm);
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#else
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -695,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@@ -705,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -752,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
@@ -830,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes);
//#else
// bcopy(src,dest,bytes);
//#endif
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@@ -873,9 +959,16 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
}
ShmBufferFreeAll();
#ifndef ACCELERATOR_AWARE_MPI
host_heap_size = heap_size;
HostCommBuf= GlobalSharedMemory::HostCommBuf;
HostBufferFreeAll();
#endif
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
@@ -897,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
//SharedMemoryTest();
SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -919,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}
void *SharedMemory::ShmBuffer(int rank)

View File

@@ -48,9 +48,10 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
optimal_comm = WorldComm;
SHM = Coordinate(processors.size(),1);
}
////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{

View File

@@ -29,13 +29,28 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern Vector<std::pair<int,int> > Cshift_table;
extern std::vector<std::pair<int,int> > Cshift_table;
extern deviceVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void)
{
// GPU version
uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz);
}
acceleratorCopyToDevice((void *)&Cshift_table[0],
(void *)&Cshift_table_device[0],
sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0];
// CPU version use identify map
}
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -74,18 +89,11 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
}
{
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@@ -110,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@@ -121,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@@ -156,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset);
}
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -225,18 +201,11 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
{
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
}
@@ -259,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@@ -268,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -340,20 +300,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}
{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@@ -392,20 +344,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}
{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
}
}

View File

@@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
@@ -52,7 +52,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
RealD t1,t0;
t0=usecond();
if ( !comm_dim ) {
// std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
@@ -63,6 +64,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
// std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
t1=usecond();
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
@@ -91,7 +94,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
@@ -101,8 +104,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
@@ -122,21 +123,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
@@ -144,26 +153,52 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
tscatter+=usecond();
}
}
if (Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -190,6 +225,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
@@ -198,8 +239,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
@@ -207,6 +248,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object);
@@ -227,7 +272,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
tgather-=usecond();
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
@@ -252,216 +299,51 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
}
#endif
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,5 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
Vector<std::pair<int,int> > Cshift_table;
std::vector<std::pair<int,int> > Cshift_table;
deviceVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid);

File diff suppressed because it is too large Load Diff

View File

@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
@@ -46,4 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/PaddedCell.h>

View File

@@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
// typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
@@ -345,7 +345,9 @@ GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnarySpTa, SpTa(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
@@ -456,7 +458,9 @@ GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(SpTa, UnarySpTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the

View File

@@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = lhs.Checkerboard();
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
@@ -53,6 +54,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@@ -70,6 +72,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@@ -86,6 +89,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("add");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@@ -106,6 +110,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
autoView( ret_v , ret, AcceleratorWrite);
@@ -119,6 +124,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -133,6 +139,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -146,6 +153,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("add");
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
autoView( ret_v , ret, AcceleratorWrite);
@@ -163,6 +171,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -177,6 +186,7 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -191,6 +201,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -204,6 +215,7 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("add");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@@ -218,6 +230,7 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
GRID_TRACE("axpy");
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
@@ -231,6 +244,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
}
template<class sobj,class vobj> inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
GRID_TRACE("axpby");
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
@@ -243,16 +257,68 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
});
}
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpy_norm");
#ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpby_norm");
#ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
}
/// Trace product
template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
-> Lattice<decltype(trace(obj()))>
{
typedef decltype(trace(obj())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( rhs2 , rhs_2, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
-> Lattice<decltype(trace(obj1()))>
{
typedef decltype(trace(obj1())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1)
-> Lattice<decltype(trace(obj1()))>
{
return traceProduct(rhs_1,rhs_2);
}
NAMESPACE_END(Grid);
#endif

View File

@@ -117,6 +117,7 @@ public:
////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -140,6 +141,7 @@ public:
}
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -163,6 +165,7 @@ public:
}
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@@ -231,10 +234,23 @@ public:
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 0
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(*vvtmp_p);
coalescedWrite(me[ss],stmp);
});
#else
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss]= r;
});
#endif
me.ViewClose();
return *this;
}
@@ -288,8 +304,8 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@@ -303,8 +319,8 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@@ -357,7 +373,7 @@ public:
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){
for(int64_t g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);

View File

@@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View;
Vector<View> basis_v; basis_v.reserve(basis.size());
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorWrite));
h_basis_v[k] = basis[k].View(AcceleratorWrite);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
}
#if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
View *basis_vp = &d_basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
@@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
Vector <vobj> Bt(siteBlock * nrot);
deviceVector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of matrix
Vector<Coeff_t> Qt_jv(Nm*Nm);
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
Qt_p[i]=Qt(j,k);
h_Qt_jv[i]=Qt(j,k);
});
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
}
// Extract a single rotated vector
@@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard();
Vector<View> basis_v; basis_v.reserve(basis.size());
hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorRead));
h_basis_v[k]=basis[k].View(AcceleratorRead);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0];
vobj zz=Zero();
deviceVector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
auto basis_vp=& d_basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero();
@@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
}
coalescedWrite(result_v[ss], B);
});
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
}
template<class Field>

View File

@@ -29,13 +29,26 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1)
{
auto ff = localNorm2(f);
if ( mu==-1 ) mu = f.Grid()->Nd()-1;
typedef typename vobj::tensor_reduced normtype;
typedef typename normtype::scalar_object scalar;
std::vector<scalar> sff;
sliceSum(ff,sff,mu);
for(int t=0;t<sff.size();t++){
std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
}
}
template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);

View File

@@ -32,7 +32,6 @@ template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -82,7 +81,6 @@ template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -130,7 +128,6 @@ template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();

View File

@@ -96,9 +96,6 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@@ -125,14 +122,17 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj>
typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
typename vobj::scalar_object s;
peekSite(s,l,site);
return s;
}
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site));
// assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -173,13 +173,13 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
idx= grid->iIndex(site);
odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l[odx];
const vector_type *vp = (const vector_type *) &l[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
pt[w] = vp[idx+w*Nsimd];
pt[w] = getlane(vp[w],idx);
}
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
return;
};
template<class vobj,class sobj>
@@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site));
// assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -210,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
idx= grid->iIndex(site);
odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l[odx];
vector_type * vp = (vector_type *)&l[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w];
putlane(vp[w],pt[w],idx);
}
return;
};

View File

@@ -28,6 +28,10 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
#include <Grid/lattice/Lattice_slicesum_core.h>
NAMESPACE_BEGIN(Grid);
@@ -42,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
std::vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
@@ -71,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
std::vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
@@ -91,10 +95,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
return ssum;
}
/*
Threaded max, don't use for now
@@ -127,7 +128,7 @@ inline Double max(const Double *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
@@ -136,25 +137,61 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
{
Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
return sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
return sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
auto ssum = rankSum(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu(&arg_v[0],osites);
return sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
return sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
auto ssum = rankSumLarge(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
@@ -167,6 +204,27 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm);
}
template<class Op,class T1>
inline auto norm2(const LatticeUnaryExpression<Op,T1> & expr) ->RealD
{
return norm2(closure(expr));
}
template<class Op,class T1,class T2>
inline auto norm2(const LatticeBinaryExpression<Op,T1,T2> & expr) ->RealD
{
return norm2(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto norm2(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) ->RealD
{
return norm2(closure(expr));
}
//The global maximum of the site norm2
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
{
@@ -197,7 +255,6 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
ComplexD nrm;
@@ -207,8 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites();
// Might make all code paths go this way.
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
deviceVector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
@@ -216,24 +273,63 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
}
// This is in single precision and fails some tests
auto anrm = sum(inner_tmp_v,sites);
auto anrm = sumD(inner_tmp_v,sites);
nrm = anrm;
return nrm;
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// Hack
// Fast integer xor checksum. Can also be used in comms now.
autoView(l_v,left,AcceleratorRead);
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
} else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
#endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm);
ok = FlightRecorder::NormLog(real(nrm));
if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@@ -257,8 +353,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
conformable(z,x);
conformable(x,y);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
// typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x.Grid();
@@ -270,18 +365,54 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, 1,{
auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// z_v
{
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&z_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// inner_v
{
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
}
#endif
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm));
assert(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@@ -290,9 +421,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
{
conformable(left,right);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
Vector<ComplexD> tmp(2);
std::vector<ComplexD> tmp(2);
GridBase *grid = left.Grid();
@@ -302,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
// GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
deviceVector<inner_t> inner_tmp(sites);
deviceVector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
{
@@ -353,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
std::vector<typename vobj::scalar_object> &result,
int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
@@ -375,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
std::vector<vobj> lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
@@ -387,19 +519,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
int ostride=grid->_ostride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
//Reduce Data down to lvSum
sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
@@ -433,8 +556,32 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
}
template<class vobj> inline
std::vector<typename vobj::scalar_object>
sliceSum(const Lattice<vobj> &Data,int orthogdim)
{
std::vector<typename vobj::scalar_object> result;
sliceSum(Data,result,orthogdim);
return result;
}
/*
Reimplement
1)
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
2)
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
3)
-- Make Slice Mul Matrix call sliceMaddMatrix
*/
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
@@ -454,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
Vector<vector_type> lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<vector_type> lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@@ -539,6 +686,7 @@ template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int orthogdim,RealD scale=1.0)
{
// perhaps easier to just promote A to a field and use regular madd
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
@@ -569,8 +717,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
for(int l=0;l<Nsimd;l++){
grid->iCoorFromIindex(icoor,l);
int ldx =r+icoor[orthogdim]*rd;
scalar_type *as =(scalar_type *)&av;
as[l] = scalar_type(a[ldx])*zscale;
av.putlane(scalar_type(a[ldx])*zscale,l);
}
tensor_reduced at; at=av;
@@ -585,206 +732,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0);
std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(0);
std::vector<int> latt_phys(NN-1);
Coordinate simd_phys;
std::vector<int> mpi_phys(NN-1);
Coordinate checker_dim_mask(NN-1);
int checker_dim=-1;
int dd;
for(int d=0;d<NN;d++){
if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
mpi_phys[dd] =BlockSolverGrid->_processors[d];
checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
dd++;
}
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
if(BlockSolverGrid->_isCheckerBoarded) {
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
delete tmp;
return (GridBase *) ret;
} else {
return (GridBase *) tmp;
}
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
Lattice<vobj> Ys(SliceGrid);
Lattice<vobj> Rs(SliceGrid);
Lattice<vobj> Xs(SliceGrid);
Lattice<vobj> RR(FullGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
RR = R; // Copies checkerboard for insert
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( X_v, X, CpuRead);
autoView( Y_v, Y, CpuRead);
autoView( R_v, R, CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type;
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
for(int i=0;i<Nslice;i++){
ExtractSlice(Ys,Y,i,Orthog);
ExtractSlice(Rs,R,i,Orthog);
Rs=Ys;
for(int j=0;j<Nslice;j++){
ExtractSlice(Xs,X,j,Orthog);
Rs = Rs + Xs*(scale*aa(j,i));
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
InsertSlice(Rs,RR,i,Orthog);
}
R=RR; // Copy back handles arguments aliasing case
delete SliceGrid;
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( R_v, R, CpuWrite);
autoView( X_v, X, CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
R=Zero();
sliceMaddMatrix(R,aa,X,R,Orthog,scale);
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
Lattice<vobj> ls(SliceGrid);
Lattice<vobj> rs(SliceGrid);
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}});
thread_critical
{
mat += mat_thread;
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
for(int s=0;s<Nslice;s++){
ExtractSlice(ls,lhs,s,Orthog);
for(int ss=0;ss<Nslice;ss++){
ExtractSlice(rs,rhs,ss,Orthog);
mat(s,ss) = innerProduct(ls,rs);
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
delete SliceGrid;
}
NAMESPACE_END(Grid);

View File

@@ -23,27 +23,27 @@ unsigned int nextPow2(Iterator x) {
}
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
#ifdef GRID_CUDA
cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
hipGetDevice(&device);
auto r=hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
/*
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
*/
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
@@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
exit(EXIT_FAILURE);
return 0;
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
return 1;
}
template <class sobj, class Iterator>
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
@@ -207,17 +207,67 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
deviceVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
auto result = buffer_v[0];
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
return result;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
scalarD *ret_p = (scalarD *)&ret;
const int words = sizeof(vobj)/sizeof(vector);
deviceVector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
ret_p[w] = sumD_gpu_small(tbuf,osites);
}
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
sobj ret;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
if ( ok ) {
ret = sumD_gpu_small(lat,osites);
} else {
ret = sumD_gpu_large(lat,osites);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -230,6 +280,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,92 @@
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj identity; zeroit(identity);
sobj ret; zeroit(ret);
Integer nsimd= vobj::Nsimd();
{
sycl::buffer<sobj, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(sycl::range<1>{osites},
Reduction,
[=] (sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
}
sobjD dret; convertType(dret,ret);
return dret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
return sumD_gpu_tensor(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
sum ^=vec[index];
});
});
}
theGridAccelerator->wait();
return ret;
}
NAMESPACE_END(Grid);

View File

@@ -32,9 +32,8 @@
#include <random>
#ifdef RNG_SITMO
#include <Grid/random/sitmo_prng_engine.hpp>
#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
#endif
#include <Grid/random/gaussian.h>
#if defined(RNG_SITMO)
#define RNG_FAST_DISCARD
@@ -143,8 +142,8 @@ public:
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<Grid::gaussian_distribution<RealD> > _gaussian;
// std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid;
///////////////////////
@@ -153,6 +152,7 @@ public:
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
#if 0
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
@@ -180,6 +180,9 @@ public:
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
#else
eng.discardhi(site);
#endif
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
@@ -244,8 +247,8 @@ public:
GridSerialRNG() : GridRNGbase() {
_generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
// _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() );
}
@@ -358,13 +361,18 @@ public:
_generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
// _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist)
{
if ( l.Grid()->_isCheckerBoarded ) {
Lattice<vobj> tmp(_grid);
fill(tmp,dist);
pickCheckerboard(l.Checkerboard(),l,tmp);
return;
}
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
@@ -408,7 +416,7 @@ public:
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@@ -425,22 +433,29 @@ public:
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
thread_for( lidx, _grid->lSites(), {
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int64_t gidx;
int o_idx;
int i_idx;
int rank;
Coordinate pcoor;
Coordinate lcoor;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->LocalIndexToLocalCoor(lidx,lcoor);
pcoor=_grid->ThisProcessorCoor();
_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
if ( britney ) {
Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
} else {
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
@@ -516,11 +531,11 @@ public:
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
//template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
//template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,267 @@
#pragma once
#if defined(GRID_CUDA)
#include <cub/cub.cuh>
#define gpucub cub
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#elif defined(GRID_HIP)
#include <hipcub/hipcub.hpp>
#define gpucub hipcub
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#endif
NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj>
inline void sliceSumReduction_cub_small(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
size_t subvol_size = e1*e2;
deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
vobj zero_init;
zeroit(zero_init);
void *temp_storage_array = NULL;
size_t temp_storage_bytes = 0;
vobj *d_out;
int* d_offsets;
std::vector<int> offsets(rd+1,0);
for (int i = 0; i < offsets.size(); i++) {
offsets[i] = i*subvol_size;
}
//Allocate memory for output and offset arrays on device
d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
//allocate memory for temp_storage_array
temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
//prepare buffer for reduction
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
//use 2d accelerator_for to avoid launch latencies found when serially looping over rd
accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
//issue segmented reductions in computeStream
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();
acceleratorFreeDevice(temp_storage_array);
acceleratorFreeDevice(d_out);
acceleratorFreeDevice(d_offsets);
}
#endif
#if defined(GRID_SYCL)
template<class vobj>
inline void sliceSumReduction_sycl_small(const vobj *Data,
std::vector <vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
size_t subvol_size = e1*e2;
vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
vobj vobj_zero;
zeroit(vobj_zero);
for (int r = 0; r<rd; r++) {
mysum[r] = vobj_zero;
}
deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
// autoView(Data_v, Data, AcceleratorRead);
//prepare reduction buffer
accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(sycl::range<1>{subvol_size},
Reduction,
[=](sycl::id<1> item, auto &sum) {
auto s = item[0];
sum += rb_p[r*subvol_size+s];
});
});
}
theGridAccelerator->wait();
for (int r = 0; r < rd; r++) {
lvSum[r] = mysum[r];
}
free(mysum,*theGridAccelerator);
}
#endif
template<class vobj>
inline void sliceSumReduction_large(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2;
deviceVector<vector>buffer(osites);
vector *dat = (vector *)Data;
vector *buf = &buffer[0];
std::vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
#elif defined(GRID_SYCL)
sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
#endif
for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r];
}
}
}
template<class vobj>
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#elif defined (GRID_SYCL)
sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
else {
sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
}
template<class vobj>
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*ostride; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
}
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
NAMESPACE_END(Grid);

View File

@@ -66,6 +66,65 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<
return ret;
};
template<int N, class Vec>
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
typedef typename Vec::scalar_type scalar;
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<scalar, N> > > Us;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
scalar tmp= Us()()(i,j);
ComplexD ztmp(real(tmp),imag(tmp));
EigenU(i,j)=ztmp;
}}
ComplexD detD = EigenU.determinant();
typename Vec::scalar_type det(detD.real(),detD.imag());
pokeLocalSite(det,ret_v,lcoor);
});
return ret;
}
template<int N>
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
EigenU(i,j) = Us()()(i,j);
}}
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
Ui()()(i,j) = EigenUinv(i,j);
}}
pokeLocalSite(Ui,ret_v,lcoor);
});
return ret;
}
NAMESPACE_END(Grid);
#endif

View File

@@ -194,11 +194,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
#endif
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
precisionChange(out,in);
}
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
precisionChange(out,in);
}
template<typename T1,typename T2>
@@ -276,8 +276,52 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
RealD t_IP=0;
RealD t_co=0;
RealD t_za=0;
for(int v=0;v<nbasis;v++) {
t_IP-=usecond();
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
t_IP+=usecond();
t_co-=usecond();
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
t_co+=usecond();
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
t_za-=usecond();
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
t_za+=usecond();
}
// std::cout << GridLogPerformance << " blockProject : blockInnerProduct : "<<t_IP<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : conv : "<<t_co<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
}
// This only minimises data motion from CPU to GPU
// there is chance of better implementation that does a vxk loop of inner products to data share
// at the GPU thread level
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
@@ -285,10 +329,10 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -364,8 +408,15 @@ template<class vobj,class CComplex>
Lattice<dotp> coarse_inner(coarse);
// Precision promotion
RealD t;
t=-usecond();
fine_inner = localInnerProductD<vobj>(fineX,fineY);
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "<<t<<" us"<<std::endl;
t=-usecond();
blockSum(coarse_inner,fine_inner);
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : blockSum "<<t<<" us"<<std::endl;
t=-usecond();
{
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
@@ -373,6 +424,7 @@ template<class vobj,class CComplex>
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
});
}
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "<<t<<" us"<<std::endl;
}
@@ -415,6 +467,9 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
template<class vobj>
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
{
const int maxsubsec=256;
typedef iVector<vobj,maxsubsec> vSubsec;
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
@@ -442,16 +497,32 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{
// Somewhat lazy calculation
// Find the biggest power of two subsection divisor less than or equal to maxsubsec
int subsec=maxsubsec;
int subvol;
subvol=blockVol/subsec;
while(subvol*subsec!=blockVol){
subsec = subsec/2;
subvol=blockVol/subsec;
};
Lattice<vSubsec> coarseTmp(coarse);
autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard);
auto coarseTmp_p= &coarseTmp_[0];
// Sum within subsecs in a first kernel
accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{
int sc=sce/subsec;
int e=sce%subsec;
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){
auto cd = coalescedRead(zz);
for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
int sf;
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
@@ -459,12 +530,21 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
cd=cd+fineData_p[sf];
cd=cd+coalescedRead(fineData_p[sf]);
}
coarseData_p[sc] = cd;
coalescedWrite(coarseTmp_[sc](e),cd);
});
// Sum across subsecs in a second kernel
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
auto cd = coalescedRead(coarseTmp_p[sc](0));
for(int e=1;e<subsec;e++){
cd=cd+coalescedRead(coarseTmp_p[sc](e));
}
coalescedWrite(coarseData_p[sc],cd);
});
return;
}
@@ -521,7 +601,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
blockOrthonormalize(ip,Basis);
}
#if 0
#ifdef GRID_ACCELERATED
// TODO: CPU optimized version here
template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
@@ -547,26 +627,37 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
autoView( fineData_ , fineData, AcceleratorWrite);
autoView( coarseData_ , coarseData, AcceleratorRead);
typedef LatticeView<vobj> Vview;
std::vector<Vview> AcceleratorVecViewContainer_h;
for(int v=0;v<nbasis;v++) {
AcceleratorVecViewContainer_h.push_back(Basis[v].View(AcceleratorRead));
}
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis);
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview));
auto Basis_p = &AcceleratorVecViewContainer[0];
// Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{
Coordinate frdimensions=fine->_rdimensions;
Coordinate crdimensions=coarse->_rdimensions;
accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{
int sc;
Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
Lexicographic::CoorFromIndex(coor_f,sf,frdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
Lexicographic::IndexFromCoor(coor_c,sc,crdimensions);
for(int i=0;i<nbasis;i++) {
/* auto basis_ = Basis[i], );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
}
auto sum= coarseData_(sc)(0) *Basis_p[0](sf);
for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf);
coalescedWrite(fineData_[sf],sum);
});
for(int v=0;v<nbasis;v++) {
AcceleratorVecViewContainer_h[v].ViewClose();
}
return;
}
#else
// CPU version
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData,
@@ -590,6 +681,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
#endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj>
@@ -633,7 +744,11 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
static const int words=sizeof(vobj)/sizeof(vector_type);
const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
@@ -649,43 +764,186 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
assert(Fg->_processors[d] == Tg->_processors[d]);
}
// the above should guarantee that the operations are local
Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride;
Coordinate osf = Fg->_ostride;
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
///////////////////////////////////////////////////////////
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
int in_region=1;
for(int d=0;d<nd;d++){
if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){
in_region=0;
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
Coordinate from_coor, to_coor, base;
Lexicographic::CoorFromIndex(base,idx,RegionSize);
for(int i=0;i<nd;i++){
from_coor[i] = base[i] + FromLowerLeft[i];
to_coor[i] = base[i] + ToLowerLeft[i];
}
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
}
if (in_region) {
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t];
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
}
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
}
template<class vobj>
void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
assert(nF+1 == nT);
///////////////////////////////////////////////////////////
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
Coordinate RegionSize = Fg->_ldimensions;
size_t nsite = 1;
for(int i=0;i<nF;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
Coordinate from_coor(nF), to_coor(nT);
Lexicographic::CoorFromIndex(from_coor,idx,RegionSize);
int j=0;
for(int i=0;i<nT;i++){
if ( i!=orthog ) {
to_coor[i] = from_coor[j];
j++;
} else {
to_coor[i] = slice;
}
}
int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
}
template<class vobj>
void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
assert(nT+1 == nF);
///////////////////////////////////////////////////////////
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
Coordinate RegionSize = Tg->_ldimensions;
size_t nsite = 1;
for(int i=0;i<nT;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
Coordinate from_coor(nF), to_coor(nT);
Lexicographic::CoorFromIndex(to_coor,idx,RegionSize);
int j=0;
for(int i=0;i<nF;i++){
if ( i!=orthog ) {
from_coor[i] = to_coor[j];
j++;
} else {
from_coor[i] = slice;
}
}
int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
}
template<class vobj>
void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
@@ -723,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[ddl++];
hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
}
}
peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor);
@@ -745,6 +1009,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
assert(orthog<nh);
assert(orthog>=0);
assert(hg->_processors[orthog]==1);
lowDim.Checkerboard() = higherDim.Checkerboard();
int dl; dl = 0;
for(int d=0;d<nh;d++){
@@ -762,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
int ddl=0;
hcoor[orthog] = slice;
int ddl=0;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[ddl++];
hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
}
}
peekLocalSite(s,higherDimv,hcoor);
@@ -775,7 +1045,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
}
//Can I implement with local copyregion??
template<class vobj>
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
@@ -796,61 +1066,18 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor);
}
});
Coordinate sz = lg->_ldimensions;
sz[orthog]=1;
Coordinate f_ll(nl,0); f_ll[orthog]=slice_lo;
Coordinate t_ll(nh,0); t_ll[orthog]=slice_hi;
localCopyRegion(lowDim,higherDim,f_ll,t_ll,sz);
}
template<class vobj>
void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
typedef typename vobj::scalar_object sobj;
GridBase *lg = lowDim.Grid();
GridBase *hg = higherDim.Grid();
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl == nh);
assert(orthog<nh);
assert(orthog>=0);
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDimv,lcoor);
}
});
InsertSliceLocal(higherDim,lowDim,slice_hi,slice_lo,orthog);
}
@@ -876,7 +1103,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
Coordinate fcoor(nd);
Coordinate ccoor(nd);
for(int g=0;g<fg->gSites();g++){
for(int64_t g=0;g<fg->gSites();g++){
fg->GlobalIndexToGlobalCoor(g,fcoor);
for(int d=0;d<nd;d++){
@@ -1080,11 +1307,80 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
});
}
//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
template<class VobjOut, class VobjIn>
void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
typedef typename VobjOut::vector_type Vout;
typedef typename VobjIn::vector_type Vin;
const int N = sizeof(VobjOut)/sizeof(Vout);
conformable(out.Grid(),in.Grid());
out.Checkerboard() = in.Checkerboard();
int nsimd = out.Grid()->Nsimd();
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(idx,out.Grid()->oSites(),1,{
Vout *vout = (Vout *)&out_v[idx];
Vin *vin = (Vin *)&in_v[idx];
precisionChange(vout,vin,N);
});
}
//Convert a Lattice from one precision to another (original, slow implementation)
template<class VobjOut, class VobjIn>
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
assert(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){
assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
}
out.Checkerboard() = in.Checkerboard();
GridBase *in_grid=in.Grid();
GridBase *out_grid = out.Grid();
typedef typename VobjOut::scalar_object SobjOut;
typedef typename VobjIn::scalar_object SobjIn;
int ndim = out.Grid()->Nd();
int out_nsimd = out_grid->Nsimd();
int in_nsimd = in_grid->Nsimd();
std::vector<Coordinate > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){
out_icoor[lane].resize(ndim);
out_grid->iCoorFromIindex(out_icoor[lane], lane);
}
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
autoView( out_v , out, CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
ExtractPointerArray<SobjOut> ptrs(out_nsimd);
Coordinate lcoor(out_grid->Nd());
for(int lane=0; lane < out_nsimd; lane++){
for(int mu=0;mu<ndim;mu++)
lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
ptrs[lane] = &in_slex_conv[llex];
}
merge(out_v[out_oidx], ptrs, 0);
});
}
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
class precisionChangeWorkspace{
std::pair<Integer,Integer>* fmap_device; //device pointer
//maintain grids for checking
GridBase* _out_grid;
GridBase* _in_grid;
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
@@ -1131,20 +1427,46 @@ public:
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
void checkGrids(GridBase* out, GridBase* in) const{
conformable(out, _out_grid);
conformable(in, _in_grid);
}
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//Convert a lattice of one precision to another. The input workspace contains the mapping data.
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
template<class VobjOut, class VobjIn>
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
if(out.Grid() == in.Grid()){
precisionChangeFast(out,in);
return 1;
}else{
return 0;
}
}
template<class VobjOut, class VobjIn>
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
return 0;
}
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
//which contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
if(_precisionChangeFastWrap(out,in,0)) return;
static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
workspace.checkGrids(out.Grid(),in.Grid());
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
@@ -1161,15 +1483,18 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const pre
});
}
//Convert a Lattice from one precision to another
//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
if(_precisionChangeFastWrap(out,in,0)) return;
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
precisionChange(out, in, workspace);
}
////////////////////////////////////////////////////////////////////////////////
// Communicate between grids
////////////////////////////////////////////////////////////////////////////////
@@ -1464,5 +1789,35 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
}
}
//////////////////////////////////////////////////////
// Faster but less accurate blockProject
//////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const VLattice &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
Lattice<iScalar<CComplex> > ip(coarse);
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
RealD t_IP=0;
RealD t_co=0;
for(int v=0;v<nbasis;v++) {
t_IP-=usecond();
blockInnerProductD(ip,Basis[v],fineData);
t_IP+=usecond();
t_co-=usecond();
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
t_co+=usecond();
}
}
NAMESPACE_END(Grid);

View File

@@ -45,6 +45,7 @@ public:
};
// Host only
GridBase * getGrid(void) const { return _grid; };
vobj* getHostPointer(void) const { return _odata; };
};
/////////////////////////////////////////////////////////////////////////////////////////

601
Grid/lattice/PaddedCell.h Normal file
View File

@@ -0,0 +1,601 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/PaddedCell.h
Copyright (C) 2019
Author: Peter Boyle pboyle@bnl.gov
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include<Grid/cshift/Cshift.h>
NAMESPACE_BEGIN(Grid);
//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
template<typename vobj>
struct CshiftImplBase{
virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
virtual ~CshiftImplBase(){}
};
template<typename vobj>
struct CshiftImplDefault: public CshiftImplBase<vobj>{
Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
};
template<typename Gimpl>
struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
};
/*
*
* TODO:
* -- address elementsof vobj via thread block in Scatter/Gather
* -- overlap comms with motion in Face_exchange
*
*/
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int rNsimda= Nsimd/simd[dim]; // should be equal
assert(rNsimda==rNsimd);
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
//doesn't hide the SIMD direction and keeps explicit in the threadIdx
//for cross platform
// FIXME -- can put internal indices into thread loop
auto buf_p = & buf[0];
autoView(lat_v, lat, AcceleratorWrite);
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
#ifdef GRID_SIMT
{
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int blane=0;blane<Nsimd;blane++) {
#endif
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
///////////////////////////////////////////////////////////////
// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
///////////////////////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Transfer into lattice - will coalesce
///////////////////////////////////////////
// sobj obj = extractLane(blane,buf_p[ss+offset]);
// insertLane(lane,lat_v[osite],obj);
const int words=sizeof(vobj)/sizeof(vector_type);
vector_type * from = (vector_type *)&buf_p[ss+offset];
vector_type * to = (vector_type *)&lat_v[osite];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], blane);
putlane(to[w], stmp, lane);
}
}
});
}
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
const Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
autoView(lat_v, lat, AcceleratorRead);
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
//doesn't hide the SIMD direction and keeps explicit in the threadIdx
//for cross platform
//For CPU perhaps just run a loop over Nsimd
auto buf_p = & buf[0];
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
#ifdef GRID_SIMT
{
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int blane=0;blane<Nsimd;blane++) {
#endif
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
////////////////////////////////////////////
// osite
////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Take out of lattice
///////////////////////////////////////////
// sobj obj = extractLane(lane,lat_v[osite]);
// insertLane(blane,buf_p[ss+offset],obj);
const int words=sizeof(vobj)/sizeof(vector_type);
vector_type * to = (vector_type *)&buf_p[ss+offset];
vector_type * from = (vector_type *)&lat_v[osite];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], lane);
putlane(to[w], stmp, blane);
}
}
});
}
class PaddedCell {
public:
GridCartesian * unpadded_grid;
int dims;
int depth;
std::vector<GridCartesian *> grids;
~PaddedCell()
{
DeleteGrids();
}
PaddedCell(int _depth,GridCartesian *_grid)
{
unpadded_grid = _grid;
depth=_depth;
dims=_grid->Nd();
AllocateGrids();
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate procs =unpadded_grid->ProcessorGrid();
for(int d=0;d<dims;d++){
if ( procs[d] > 1 ) assert(local[d]>=depth);
}
}
void DeleteGrids(void)
{
Coordinate processors=unpadded_grid->_processors;
for(int d=0;d<grids.size();d++){
if ( processors[d] > 1 ) {
delete grids[d];
}
}
grids.resize(0);
};
void AllocateGrids(void)
{
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate simd =unpadded_grid->_simd_layout;
Coordinate processors=unpadded_grid->_processors;
Coordinate plocal =unpadded_grid->LocalDimensions();
Coordinate global(dims);
GridCartesian *old_grid = unpadded_grid;
// expand up one dim at a time
for(int d=0;d<dims;d++){
if ( processors[d] > 1 ) {
plocal[d] += 2*depth;
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
old_grid = new GridCartesian(global,simd,processors);
}
grids.push_back(old_grid);
}
};
template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
// depends on the MPI spread
Coordinate fll(dims,depth);
Coordinate tll(dims,0); // depends on the MPI spread
for(int d=0;d<dims;d++){
if( processors[d]==1 ) fll[d]=0;
}
localCopyRegion(in,out,fll,tll,local);
return out;
}
template<class vobj>
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = Expand(d,tmp,cshift); // rvalue && assignment
}
return tmp;
}
template<class vobj>
inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = ExpandPeriodic(d,tmp); // rvalue && assignment
}
return tmp;
}
// expand up one dim at a time
template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
// replace with a copy and maybe grid swizzle
// return in;??
double t = usecond();
padded = in;
tins += usecond() - t;
} else {
//////////////////////////////////////////////
// Replace sequence with
// ---------------------
// (i) Gather high face(s); start comms
// (ii) Gather low face(s); start comms
// (iii) Copy middle bit with localCopyRegion
// (iv) Complete high face(s), insert slice(s)
// (iv) Complete low face(s), insert slice(s)
//////////////////////////////////////////////
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
}
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded;
}
template<class vobj>
inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
// Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
// std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
padded=in; // slightly different interface could avoid a copy operation
} else {
Face_exchange(in,padded,dim,depth);
return padded;
}
return padded;
}
template<class vobj>
void Face_exchange(const Lattice<vobj> &from,
Lattice<vobj> &to,
int dimension,int depth) const
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::scalar_object sobj;
RealD t_gather=0.0;
RealD t_scatter=0.0;
RealD t_comms=0.0;
RealD t_copy=0.0;
// std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
// DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
GridBase *grid=from.Grid();
GridBase *new_grid=to.Grid();
Coordinate lds = from.Grid()->_ldimensions;
Coordinate nlds= to.Grid()->_ldimensions;
Coordinate simd= from.Grid()->_simd_layout;
int ld = lds[dimension];
int nld = to.Grid()->_ldimensions[dimension];
const int Nsimd = vobj::Nsimd();
assert(depth<=lds[dimension]); // A must be on neighbouring node
assert(depth>0); // A caller bug if zero
assert(ld+2*depth==nld);
////////////////////////////////////////////////////////////////////////////
// Face size and byte calculations
////////////////////////////////////////////////////////////////////////////
int buffer_size = 1;
for(int d=0;d<lds.size();d++){
if ( d!= dimension) buffer_size=buffer_size*lds[d];
}
buffer_size = buffer_size / Nsimd;
int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static deviceVector<vobj> send_buf;
static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req;
int words = buffer_size;
int bytes = words * sizeof(vobj);
////////////////////////////////////////////////////////////////////////////
// Communication coords
////////////////////////////////////////////////////////////////////////////
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
////////////////////////////////////////////////////////////////////////////
// Gather all surface terms up to depth "d"
////////////////////////////////////////////////////////////////////////////
RealD t;
RealD t_tot=-usecond();
int plane=0;
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+0;
t=usecond();
GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
t_gather+=usecond()-t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+1;
t=usecond();
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
t_gather+= usecond() - t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
t_comms+=usecond()-t;
}
////////////////////////////////////////////////////////////////////////////
// Copy interior -- overlap this with comms
////////////////////////////////////////////////////////////////////////////
int Nd = new_grid->Nd();
Coordinate LL(Nd,0);
Coordinate sz = grid->_ldimensions;
Coordinate toLL(Nd,0);
toLL[dimension]=depth;
t=usecond();
localCopyRegion(from,to,LL,toLL,sz);
t_copy= usecond() - t;
////////////////////////////////////////////////////////////////////////////
// Scatter all faces
////////////////////////////////////////////////////////////////////////////
plane=0;
t=usecond();
grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
}
t_scatter= usecond() - t;
t=usecond();
grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
}
t_scatter+= usecond() - t;
t_tot+=usecond();
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes :" << depth*bytes/1e6 << "MB"<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@@ -65,30 +65,38 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0);
GridLogError.Active(1);
GridLogWarning.Active(0);
GridLogMessage.Active(1); // at least the messages should be always on
GridLogMemory.Active(0);
GridLogTracing.Active(0);
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
if (logstreams[i] == std::string("Tracing")) GridLogTracing.Active(1);
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);

View File

@@ -138,7 +138,8 @@ public:
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
stream << log.colour() << std::left;
// stream << log.colour() << std::left;
stream << std::left;
if (log.chanWidth > 0)
{
stream << std::setw(log.chanWidth);
@@ -153,9 +154,9 @@ public:
stream << log.evidence()
<< now << log.background() << " : " ;
}
stream << log.colour();
// stream << log.colour();
stream << std::right;
stream.flags(f);
return stream;
} else {
return devnull;
@@ -180,13 +181,51 @@ extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;
extern GridLogger GridLogIntegrator;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern GridLogger GridLogTracing;
extern Colours GridLogColours;
std::string demangle(const char* name) ;
template<typename... Args>
inline std::string sjoin(Args&&... args) noexcept {
std::ostringstream msg;
(msg << ... << args);
return msg.str();
}
/*! @brief make log messages work like python print */
template <typename... Args>
inline void Grid_log(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << GridLogMessage << msg << std::endl;
}
/*! @brief make warning messages work like python print */
template <typename... Args>
inline void Grid_warn(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl;
}
/*! @brief make error messages work like python print */
template <typename... Args>
inline void Grid_error(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl;
}
/*! @brief make pass messages work like python print */
template <typename... Args>
inline void Grid_pass(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
}
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];

View File

@@ -165,7 +165,7 @@ class BinaryIO {
* FIXME -- 128^3 x 256 x 16 will overflow.
*/
int global_site;
int64_t global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@@ -175,8 +175,8 @@ class BinaryIO {
Lexicographic::IndexFromCoor(coor,global_site,global_vol);
uint32_t gsite29 = global_site%29;
uint32_t gsite31 = global_site%31;
uint64_t gsite29 = global_site%29;
uint64_t gsite31 = global_site%31;
site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
// std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
@@ -545,7 +545,9 @@ class BinaryIO {
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
uint32_t &scidac_csumb,
int control=BINARYIO_LEXICOGRAPHIC
)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
@@ -556,7 +558,7 @@ class BinaryIO {
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control,
nersc_csum,scidac_csuma,scidac_csumb);
GridStopWatch timer;
@@ -582,7 +584,8 @@ class BinaryIO {
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
uint32_t &scidac_csumb,
int control=BINARYIO_LEXICOGRAPHIC)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
@@ -607,7 +610,7 @@ class BinaryIO {
while (attemptsLeft >= 0)
{
grid->Barrier();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control,
nersc_csum,scidac_csuma,scidac_csumb);
if (checkWrite)
{
@@ -617,7 +620,7 @@ class BinaryIO {
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
grid->Barrier();
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control,
cknersc_csum,ckscidac_csuma,ckscidac_csumb);
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
{

View File

@@ -31,6 +31,7 @@ directory
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <map>
#include <pwd.h>
@@ -161,8 +162,14 @@ template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
{
uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
if ( scidac_csuma !=scidac_checksuma) return 0;
if ( scidac_csumb !=scidac_checksumb) return 0;
std::cout << GridLogMessage << " scidacChecksumVerify computed "<<scidac_csuma<<" expected "<<scidac_checksuma <<std::endl;
std::cout << GridLogMessage << " scidacChecksumVerify computed "<<scidac_csumb<<" expected "<<scidac_checksumb <<std::endl;
if ( scidac_csuma !=scidac_checksuma) {
return 0;
};
if ( scidac_csumb !=scidac_checksumb) {
return 0;
};
return 1;
}
@@ -205,7 +212,7 @@ class GridLimeReader : public BinaryIO {
// Read a generic lattice field and verify checksum
////////////////////////////////////////////
template<class vobj>
void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
{
typedef typename vobj::scalar_object sobj;
scidacChecksum scidacChecksum_;
@@ -237,7 +244,7 @@ class GridLimeReader : public BinaryIO {
uint64_t offset= ftello(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control);
std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
/////////////////////////////////////////////
@@ -407,7 +414,7 @@ class GridLimeWriter : public BinaryIO
// in communicator used by the field.Grid()
////////////////////////////////////////////////////
template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
{
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
@@ -458,7 +465,7 @@ class GridLimeWriter : public BinaryIO
///////////////////////////////////////////
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control);
///////////////////////////////////////////
// Wind forward and close the record
@@ -511,7 +518,8 @@ class ScidacWriter : public GridLimeWriter {
////////////////////////////////////////////////
template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0)
const unsigned int recordScientificPrec = 0,
int control=BINARYIO_LEXICOGRAPHIC)
{
GridBase * grid = field.Grid();
@@ -533,7 +541,7 @@ class ScidacWriter : public GridLimeWriter {
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
}
// Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control); // Closes message with checksum
}
};
@@ -552,7 +560,8 @@ class ScidacReader : public GridLimeReader {
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord,
int control=BINARYIO_LEXICOGRAPHIC)
{
typedef typename vobj::scalar_object sobj;
GridBase * grid = field.Grid();
@@ -570,7 +579,7 @@ class ScidacReader : public GridLimeReader {
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);
}
void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA);
@@ -654,7 +663,8 @@ class IldgWriter : public ScidacWriter {
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
ildgfmt.field = std::string("su3gauge");
const std::string stNC = std::to_string( Nc ) ;
ildgfmt.field = std::string("su"+stNC+"gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
@@ -871,7 +881,8 @@ class IldgReader : public GridLimeReader {
} else {
assert(found_ildgFormat);
assert ( ildgFormat_.field == std::string("su3gauge") );
const std::string stNC = std::to_string( Nc ) ;
assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@@ -879,7 +890,7 @@ class IldgReader : public GridLimeReader {
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);

View File

@@ -6,8 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -203,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
assert( Nc < 4 && Nc > 1 ) ;
for(int mu=0;mu<Nd;mu++){
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
cm(mu)()(1,1) = adj(cm(mu)()(0,x)) ;
#else
const int x=0 , y=1 , z=2 ; // a little disinenuous labelling
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#endif
}
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@@ -278,7 +282,6 @@ struct GaugeSimpleMunger{
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
@@ -317,8 +320,8 @@ template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
@@ -330,8 +333,8 @@ template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}

View File

@@ -9,6 +9,7 @@
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -30,6 +31,8 @@
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
#include <string>
NAMESPACE_BEGIN(Grid);
using namespace Grid;
@@ -147,15 +150,17 @@ public:
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
const int ieee32big = (format == std::string("IEEE32BIG"));
const int ieee32 = (format == std::string("IEEE32"));
const int ieee64big = (format == std::string("IEEE64BIG"));
const int ieee64 = (format == std::string("IEEE64") || \
format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
const std::string stNC = std::to_string( Nc ) ;
if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@@ -166,7 +171,7 @@ public:
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
} else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@@ -211,27 +216,29 @@ public:
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"))
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
writeConfiguration(Umu,file,0,1,ens_label);
writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
int two_row,
int bits32,
std::string ens_label = std::string("DWF"))
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = std::string("UKQCD");
header.sequence_number = sequence_number;
header.ensemble_id = ens_id;
header.ensemble_label = ens_label;
header.hdr_version = "1.0" ;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
@@ -245,10 +252,14 @@ public:
uint64_t offset;
// Sod it -- always write 3x3 double
// Sod it -- always write NcxNc double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
const std::string stNC = std::to_string( Nc ) ;
if( two_row ) {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
} else {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
}
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
@@ -256,8 +267,15 @@ public:
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
if( two_row ) {
Gauge3x2unmunger<fobj2D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
}
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
@@ -290,7 +308,6 @@ public:
MachineCharacteristics(header);
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");

View File

@@ -27,10 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid);
GridTimePoint theProgramStart = GridClock::now();
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {

View File

@@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H
#ifndef __SSC_START
#define __SSC_START
#define __SSC_STOP
#endif
#include <sys/time.h>
#include <ctime>
#include <chrono>
@@ -72,17 +78,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/

View File

@@ -35,17 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid)
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
#ifdef TIMERS_ON
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
//typedef std::chrono::system_clock GridClock;
typedef std::chrono::high_resolution_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::seconds GridSecs;
@@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridUsecs;
typedef std::chrono::microseconds GridTime;
extern GridTimePoint theProgramStart;
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart);
return 1.0*usecs.count();
}
inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
{
stream << time.count()<<" s";

Some files were not shown because too many files have changed in this diff Show More