1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Compare commits

...

3917 Commits

Author SHA1 Message Date
88479961ba Merge 8729c46169 into ba9bbe0221 2025-02-13 11:12:47 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce throuhgh Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issueing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
b50fb34e71 Perf on Aurora 2025-02-01 18:39:34 +00:00
de84d730ff Fastest run config on Aurora to date 2025-02-01 18:08:40 +00:00
c74d11e3d7 PVdagM MG 2025-02-01 11:04:13 -05:00
c4fc972fec Merge branch 'feature/deprecate-uvm' into develop 2025-01-31 16:32:36 +00:00
8cf809e231 Best results on Aurora so far 2025-01-31 16:14:45 +00:00
94019a922e Significantly better performance on Aurora without using pipeline mode 2025-01-30 16:36:46 +00:00
d6b2727f86 Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora 2025-01-29 09:22:21 +00:00
74a4f43946 Optional host buffer bounce for no CUDA aware MPI 2025-01-28 15:22:46 +00:00
1caf8b0f86 Rename 2025-01-28 15:22:37 +00:00
3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
8fe429346f Dslash testing for reproduce 2024-11-11 23:11:11 +00:00
5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
b91fc1b6b4 Merge branch 'feature/boosted' into feature/deprecate-uvm
Fixed boosted free field test
2024-10-28 16:53:09 -04:00
eafc150034 Test fft asserts 2024-10-23 16:46:26 -04:00
2877f1a268 Verbose reduce 2024-10-23 15:14:16 -04:00
1e893af775 GPU happy 2024-10-23 14:52:15 -04:00
d9f430a575 Happy GPU 2024-10-23 14:51:16 -04:00
63abe87f36 Memory manager verbose improvements that were useful to track an error 2024-10-23 14:49:13 -04:00
368d649c8a feature/deprecate-uvm happier -- preallocate device resident neigbour table 2024-10-23 14:47:55 -04:00
5603464f39 Fix in partial fraction import/export physical and
make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class
2024-10-23 14:45:58 -04:00
655c79f39e Suppress warning on partial override 2024-10-23 14:44:41 -04:00
565b231c03 Nvcc happy 2024-10-23 14:44:17 -04:00
62a9f180fa NVCC happy 2024-10-23 14:44:04 -04:00
5ae77876a8 Meson field and Aslash field on GPU; some compiler warning removed 2024-10-18 19:08:06 -04:00
4ed2c2c74f Config command 2024-10-18 13:58:33 -04:00
955da582b6 Working on NVCC 2024-10-18 13:58:03 -04:00
11b07b950d Vanilla linux compile, assuming spack prerequisites 2024-10-18 13:57:40 -04:00
8f70cfeda9 Clean up 2024-10-18 13:56:53 -04:00
ce64271048 Remove the copying version 2024-10-18 13:56:24 -04:00
5cc4f3241d Meson field test 2024-10-18 15:42:30 +00:00
6815e138b4 Boosted fermion attempt 2024-10-17 18:37:33 +01:00
a78a61d76f Update configure 2024-10-15 14:38:45 +00:00
2eff3f34ed Alternate reduction; default to grids own but make a configure flag
--enable-reduction=grid|mpi
2024-10-15 14:36:06 +00:00
03687c1d62 Final version of test, closer to original again 2024-10-15 14:35:17 +00:00
febfe4e77f Make my own reduction a configure flag 2024-10-15 14:32:35 +00:00
4d1aa134b5 Use normal reduction, configure flag to force deterministic 2024-10-15 14:32:11 +00:00
5ec879860a Odd rounding issue - bears looking into 2024-10-15 14:30:54 +00:00
f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
b728af903c Fast axpy norm under CFLAG 2024-10-11 03:23:09 +00:00
54f1999030 axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues 2024-10-11 03:22:18 +00:00
fd58f0b669 Return ok 2024-10-11 03:21:21 +00:00
c5c67b706e cl::sycl -> SYCL 2024-10-10 22:04:12 +00:00
be7a543e2c Revert barriers -- these were not the problem 2024-10-10 22:03:29 +00:00
68f112d576 New software moves cl::sycl 2024-10-10 22:03:04 +00:00
ec1395a304 Better flight logging 2024-10-10 22:01:57 +00:00
beb0e474ee Use deterministic own brand reduction 2024-10-10 22:01:24 +00:00
2b5fdcbbc5 New software version 2024-10-10 21:59:02 +00:00
295127d456 Deterministic homebrew reduction 2024-10-10 21:58:26 +00:00
7dcfb13694 New software stack 2024-10-10 21:57:35 +00:00
ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows for work around on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
9fa8bd6438 Configure for AOT on Aurora latest software 2024-09-23 11:25:44 +00:00
02c8178f16 Almost working on Aurora 2024-09-23 09:43:50 +00:00
e637fbacae Verbose remove 2024-09-23 09:42:43 +00:00
066544281f Deprecate UVM 2024-09-17 13:34:27 +00:00
11be10d2c0 Aurora testing 2024-09-10 18:11:52 +00:00
160969a758 UVM tester, doesn't turn up anything 2024-09-10 18:09:42 +00:00
622f78ebea SYCL updates -- operator = giving trouble on Aurora.
SYCL reduction is failing intermittently with SVM interface - returns
zero, expect non-zero.
Think I need to remove ALL dependence on SVM.
2024-09-04 13:53:48 +00:00
aa67a5b095 Rename 2024-08-27 19:54:01 +00:00
af9ea0864c Blas fix 2024-08-27 19:53:09 +00:00
4e2a6d87c4 Gemm batched fix 2024-08-27 19:24:05 +00:00
a465ecece9 Aurora 2024-08-27 19:20:43 +00:00
575eb72182 Converges on 16^3 2024-08-27 19:20:38 +00:00
3a973914d6 Compile on frontier 2024-08-27 14:55:42 -04:00
f568c07bbd Improved the BLAS benchmark 2024-08-27 14:53:54 -04:00
2c9878fc3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-27 12:05:46 -04:00
27b1b1b005 Checkerboard available for offloading pickCheckerboard 2024-08-27 12:04:09 -04:00
130d7ab077 Verbose changes 2024-08-27 12:03:28 -04:00
29f6b8a74a Setup 2024-08-27 12:02:49 -04:00
9779aaea33 16^3 optimise 2024-08-27 11:38:35 -04:00
ec25604a67 Fastest solver for mrhs multigrid 2024-08-27 11:32:34 -04:00
3668e81c5e Extract slice working on checkerboard field for Block Lanczos 2024-08-27 11:31:30 -04:00
d66b2423cb Move slice operations to GPU for BlockCG 2024-08-27 11:28:47 -04:00
15cc78f0b6 peek/poke local site on checkerboard arrays 2024-08-27 11:23:42 -04:00
06db4ddea2 Fast init on GPU 2024-08-27 11:22:33 -04:00
6cfb90e99f Support needed for accelerator resident set/pick Checkerboard 2024-08-27 11:19:00 -04:00
d8be95a2a3 Don't early terminate power method to get more accurate top EV 2024-08-27 11:17:37 -04:00
f82702872d Normal residual 2024-08-27 11:16:44 -04:00
3752c49ef0 Add option to record the CG polynomial 2024-08-27 11:14:35 -04:00
fe65fa4988 MulMatrix 2024-08-27 11:13:18 -04:00
1fe4c205a3 Adef 2024-08-27 11:11:47 -04:00
d4dc5e0f43 BlockCG linalg acceleratoin with BLAS 2024-08-27 11:08:33 -04:00
77944437ce Functor initialisation 2024-08-27 11:01:02 -04:00
c164bff758 MMdag 2024-08-27 11:00:36 -04:00
aa2e3d954a MMdag operator 2024-08-27 10:59:29 -04:00
de62b04728 Block CG linalg acceleration 2024-08-27 10:58:54 -04:00
d0bdb50f24 Analyse power spectrum 2024-08-27 10:58:19 -04:00
a8fecbc609 BlockCG linalg via BLAS 2024-08-21 16:08:16 -04:00
557fa483ff Blas benchmark committed stand alone 2024-08-20 16:18:43 +00:00
fc15d55df6 Mallinfo 2024-08-20 14:33:09 +00:00
53573d7d94 Better benchmark 2024-08-20 14:31:57 +00:00
bb3c177000 Better benchmarking 2024-08-20 14:31:41 +00:00
a3322b470f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-20 14:30:52 +00:00
f8f408e7a9 BLAS everywhere 2024-07-25 18:09:02 +00:00
baac1127d0 Later intel compiler happiness 2024-07-25 18:06:05 +00:00
6f1328160c Remove SVM use 2024-07-25 18:05:40 +00:00
04cf902791 Mallinfo and ASAN hooks 2024-07-25 18:04:56 +00:00
7a5b1c1a19 Try Catch convenience macro 2024-07-25 18:03:41 +00:00
18d2d7da4a Eigen implementation and SYCL implementation 2024-07-25 18:02:56 +00:00
b461184797 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-23 09:53:58 -04:00
4563b39305 New Frontier config 2024-07-23 09:53:08 -04:00
c9d5674d5b FInal for paper 2024-07-22 15:26:45 -04:00
486412635a 8^4 test for PETSc 2024-07-22 15:25:17 -04:00
8b23a1546a Force compile temporarily 2024-07-22 15:24:56 -04:00
a901e4e369 Regressed performance for paper 2024-07-22 15:24:04 -04:00
804d9367d4 Regressed performance 2024-07-22 15:23:25 -04:00
41d8adca95 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-11 15:38:45 +00:00
059e8e5bb0 New compile option 2024-07-11 15:37:30 +00:00
b3ee8ded96 Respect command line 2024-07-11 15:34:48 +00:00
cf3584ad15 Convenient to monitor memory across an HMC trajectory 2024-07-11 15:30:32 +00:00
a66973163f Device vector not UVM 2024-07-11 15:24:11 +00:00
4502a8c8a1 libc malloc heap info dump on Linux 2024-07-11 15:22:18 +00:00
9c902e4c2d Batched blas, but not working yet on OneAPI 2024-07-11 15:19:49 +00:00
f3eb36adcf Namespace addition 2024-07-11 15:19:19 +00:00
7c246606c1 Schur additional case 2024-07-10 22:04:32 +00:00
172c75029e Redblack additional case 2024-07-10 22:03:59 +00:00
6ae52da571 LLVM leak sanitizer 2024-07-08 15:59:18 +00:00
4ee9c68053 Updated compile environment 2024-07-08 15:57:57 +00:00
a15b4378a3 Sanitizer preservation of options 2024-07-08 15:57:45 +00:00
89fdd7f8dd AOT compilation 2024-07-05 17:47:56 +00:00
c328be24b7 Sanitizer compile options 2024-07-05 17:46:43 +00:00
a73dc6dbf4 Display linux heap info 2024-06-28 16:05:17 +00:00
eee2a2657f Try catch exception wrappers 2024-06-28 16:02:29 +00:00
12b8be7cb9 Best so far on 96^3 350 Evecs converged on 4^4 block 2024-06-18 16:31:37 -04:00
63c223ea5d Verbose 2024-06-18 03:22:01 +00:00
2877fb4a2c More verbose if alloc failure 2024-06-18 03:21:03 +00:00
d299c86633 Std::asin,acos 2024-06-11 16:41:23 -04:00
6ce52092e8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 15:16:58 -04:00
b5926c1d21 Broadcast time info 2024-06-11 15:16:25 -04:00
9563238e9b Force initial to identity 2024-06-11 17:51:58 +00:00
fb9b1d76ca Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 16:48:16 +00:00
1739146599 Property to initialise reduction 2024-06-11 16:47:35 +00:00
ed20b39ab3 Log files from Frontier benchmark 2024-06-11 11:16:20 -04:00
284fc05f15 Protect vs. missing LIME libarary 2024-06-11 11:08:00 -04:00
07a07b6fa3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-10 15:09:25 -04:00
dc80b08969 96^3 test 2024-06-10 15:07:29 -04:00
a49a161f8d SYCL update to use buffer on reduction variable 2024-06-08 16:05:18 +00:00
a6479ca50f Shuhei's ComputeWilsonFlow main programme 2024-06-05 15:51:11 -04:00
0e607a55e7 Updated for 8^4 test 2024-05-26 20:53:05 +00:00
c4b9f71357 CPU compile ordering is important 2024-05-21 02:22:32 +01:00
394e506aea Compile options for tursa update 2024-05-21 02:10:04 +01:00
e19b26341b Tursa configure update 2024-05-21 01:14:27 +01:00
cfe1b13225 Back out zero change 2024-05-21 01:14:08 +01:00
890c5ea1cd Warning disable 2024-05-20 20:08:31 +01:00
a87378d3b6 Update 2024-05-20 20:08:31 +01:00
832fc08809 Merge pull request #459 from dbollweg/sycl_slicesum_update
Sycl slicesum bugfix
2024-05-20 15:06:53 -04:00
9a1ad6a5eb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-05-17 11:33:46 -04:00
a90eafad24 Merge branch 'feature/scidac-wp1' into develop 2024-05-17 11:32:00 -04:00
ad14a82742 Working aas good as possible on 48^3 in double 2024-05-16 10:55:45 -04:00
14e9d8ed9f CG improvements for smoother 2024-05-16 10:55:18 -04:00
0ac85fa70b Serialisation removal 2024-05-16 10:49:04 -04:00
c371de42b9 Some site tools for sitewise autocorr 2024-05-16 10:48:23 -04:00
ccf147d6c1 Select the compiler that gives better performance on sunspot 2024-05-07 18:45:56 +00:00
7aa12b446f New config command for sunspot 2024-05-07 18:45:40 +00:00
c293228102 layout control 2024-05-07 18:45:21 +00:00
5c4c9f721a Remove pbs file and replace with bench1 and bench2 for 1 and 2 nodes 2024-05-07 18:44:49 +00:00
057f86c1de 2 queues works ok in performance 2024-05-07 18:42:50 +00:00
cd52e3cbc2 Jobs on subspot 2024-05-07 18:38:15 +00:00
24602e1259 Accidental synchronise 2024-05-07 17:28:38 +00:00
8a098889fc Update FlightRecorder.cc 2024-04-30 21:15:08 +01:00
5c3ace7c3e Merge branch 'develop' into feature/scidac-wp1 2024-04-30 05:26:06 -04:00
aa148455b7 Updated todo list 2024-04-30 05:24:39 -04:00
98cf247f33 prepare to switch to mixed precision 2024-04-30 05:23:45 -04:00
0cf16522d1 Refine with HDCG choice 2024-04-30 05:22:14 -04:00
7b7c75f9e5 Setup 2024-04-30 05:21:02 -04:00
aefd255a3c Verbose 2024-04-30 05:20:41 -04:00
1c5aa939fd Subspace setup changes 2024-04-30 05:19:09 -04:00
3a0ff17be0 Verbose changes 2024-04-30 05:17:28 -04:00
47829ae5cc Verbose changes 2024-04-30 05:16:46 -04:00
bfa7b69aff Verbose changes 2024-04-16 15:42:46 -04:00
2aaa959b5f Printing changes 2024-04-16 15:41:25 -04:00
ce2970b93a Printing changes 2024-04-16 15:40:38 -04:00
7b76970d10 Verbose changes 2024-04-16 15:40:10 -04:00
9fd41882d2 Herm Op update 2024-04-16 15:39:27 -04:00
ff2ea5de18 Update Tensor_traits.h 2024-04-11 14:25:45 -04:00
5147a42818 Updated hdcg 2024-04-05 01:05:57 -04:00
57552d8ca3 Assign from non-lattice made accelerator resident 2024-04-05 01:05:12 -04:00
13713b2a76 Much faster little dirac operator calculation 2024-04-05 01:04:40 -04:00
36a14e4ee3 Best setup and introduce an HDCG refine method 2024-04-05 01:03:33 -04:00
b4cc788b8c First version used in mrhsHDCG
Need to consolidate files.
Plan: Make this version able to go virtual base, then absorb chulwoos
version when it is proven
2024-04-05 01:02:21 -04:00
0f0e7512f3 Keep MRHS in a different file 2024-04-05 00:59:53 -04:00
1196b1a161 Less verbose 2024-04-05 00:58:58 -04:00
2c8c3be9ee Adef2Mrhs 2024-04-05 00:57:13 -04:00
5b79d51c22 Improvements 2024-04-01 14:18:40 -04:00
da890dc293 Verbose changes 2024-04-01 14:18:00 -04:00
93d0a1e73a HISQ view call 2024-04-01 14:16:47 -04:00
f0a8c7d045 Playing with chebyshevs 2024-04-01 14:16:11 -04:00
db8793777c Logging/verbose 2024-04-01 14:15:41 -04:00
c745484e65 9.5x speed up version 2024-04-01 14:14:30 -04:00
da59379612 Large reg file for double 2024-03-26 17:03:20 +00:00
3ef2a41518 ifdef guard ommitted 2024-03-26 14:50:32 +00:00
aa96f420c6 Acclerator ware MPI guard on the Unix domain sockets 2024-03-26 14:41:25 +00:00
49e9e4ed0e Fences 2024-03-26 14:14:06 +00:00
f7b8163016 Deterministic MPI reduce options 2024-03-26 14:11:40 +00:00
93769eacd3 Updated configure for bounce through host 2024-03-26 14:10:24 +00:00
59b0cc11df REduce the time in single 2024-03-26 00:42:40 +00:00
f32c275376 Updated config options for MPI not being aware of GPU 2024-03-26 00:42:00 +00:00
5404fc66ab Merge needs a fence on SYCL 2024-03-26 00:38:41 +00:00
1f53458af8 Options to bounce through a host buffer if
--disable-accelerator-aware-mpi
2024-03-26 00:37:19 +00:00
434c3e7f1d We have a choice of GET or PUT across NVlink 2024-03-25 14:32:44 +00:00
500b119f3d Deterministic MPI 2024-03-22 15:55:23 +00:00
4b87259c1b New config command for sunspot 2024-03-22 15:43:49 +00:00
503dec34ef This appears working now on Sunspot 2024-03-22 15:43:30 +00:00
d1e9fe50d2 Xor csum for repro testing 2024-03-22 15:42:57 +00:00
d01e5fa838 Improved FlightRecorder 2024-03-22 15:42:32 +00:00
a477c25e8c Sunspot repro tests 2024-03-22 15:42:11 +00:00
1bd20cd9e8 FlightRecorder 2024-03-22 15:40:01 +00:00
e49e95b037 Upgrade of the Britney test with flight recorder and fast xor checksum 2024-03-22 15:39:27 +00:00
6f59fed563 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:32 +00:00
60b7f6c99d Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:26 +00:00
b92dfcc8d3 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:27 +00:00
f6fd6dd053 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:01 +00:00
79ad567dd5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-19 15:43:42 +00:00
fab1efb48c More britney logging improvements 2024-03-19 14:36:21 +00:00
660eb76d93 FFTW from OneAPI 2024-03-19 14:28:33 +00:00
461cd045c6 sliceSum cleanup 2024-03-13 18:18:44 -04:00
fee65d7a75 Merge branch 'paboyle:develop' into sycl_slicesum_update 2024-03-13 18:06:17 -04:00
31f9971dbf avoid PI_ERROR_OUT_OF_RESOURCES in sycl sliceSum 2024-03-13 13:39:26 -04:00
62e7bf024a Updated flight logging for Britney test 2024-03-12 20:10:04 +00:00
95f3d69cf9 Extra hardware test hook 2024-03-12 20:09:37 +00:00
89c0519f83 Repro test 2024-03-12 16:11:33 +00:00
2704b82084 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-12 15:16:24 +00:00
cf8632bbac Britney test option 2024-03-12 15:15:35 +00:00
d224297972 PBS scripts 2024-03-12 15:15:16 +00:00
a4d11a630f Merge pull request #458 from paboyle/fix/HOST_NAME_MAX
fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined
2024-03-07 07:50:25 -05:00
2b4399f8b1 more HOST_NAME_MAX fix 2024-03-07 15:26:01 +09:00
f17b8de907 fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined 2024-03-07 15:22:08 +09:00
d87296f3e8 Merge branch 'develop' of https://github.com/dbollweg/Grid into develop 2024-03-06 16:54:22 -05:00
be94cf1c6f Fewer wait-calls in sycl slicesum 2024-03-06 16:53:13 -05:00
cc04dc42dc Merge branch 'develop' into feature/scidac-wp1 2024-03-06 14:55:21 -05:00
070b61f08f Simplifying the MultiRHS solver to make it do SRHS *and* MRHS 2024-03-06 14:04:33 -05:00
7e5bd46dd3 Booster update 2024-03-06 19:03:45 +01:00
228bbb9d81 Benchmark results 2024-03-06 19:03:35 +01:00
b812a7b4c6 Staggered launch script 2024-03-06 01:32:40 +00:00
891a366f73 Repro CG script 2024-03-06 01:22:55 +00:00
10116b3be8 Force device copyable and tell SYCL to shut it. 2024-03-06 01:13:27 +00:00
a46a0f0882 force device copyable and don't take crap from SYCL 2024-03-06 01:12:49 +00:00
a26a8a38f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-06 00:05:00 +00:00
7435315d50 More blasted shell variables 2024-03-06 00:03:59 +00:00
9b5f741e85 Reproducing CG can be more useful now 2024-03-06 00:03:16 +00:00
517822fdd2 SPR HBM benchmarking right and also PVC batched GEMM 2024-03-06 00:02:27 +00:00
1b93a9be88 Print out the hostname 2024-03-06 00:01:58 +00:00
783a66b348 Deterministic reduction please 2024-03-06 00:01:37 +00:00
976c3e9b59 Hack for flight logging CG inner products.
Can be made to work, but could put in some more serious infrastructure
for repro testing and blame attribution (Britney test) if necessary
2024-03-05 23:59:57 +00:00
f8ca971dae Use of a bare PRECISION macro is not namespace safe and collides with
SYCL
2024-03-05 23:59:13 +00:00
21bc8c24df OneMKL batched blas starting 2024-03-05 23:58:20 +00:00
30228214f7 SYCL conflict with Eigen 2024-03-05 23:56:10 +00:00
2ae980ae43 Update sourceme.sh 2024-03-05 13:39:18 -05:00
6153dec2e4 Update setup.sh 2024-03-05 13:38:32 -05:00
c805f86343 USQCD benchmark 2024-03-01 00:05:04 -05:00
04ca065281 Only one rank opens 2024-02-29 20:09:11 -05:00
88d8fa43d7 Benchmark development 2024-02-29 20:01:44 -05:00
3c49762875 Propagate in the blas routine 2024-02-29 15:33:06 -05:00
436bf1d9d3 Merge pull request #455 from clarkedavida/hisq_fat_links
Hisq fat links
2024-02-29 15:29:39 -05:00
f70df6e195 changed NO_SHIFT and BACKWARD_CONST from define to enum 2024-02-29 12:29:30 -07:00
fce3852dff Merge pull request #451 from paboyle/feature/eigen-3.4.0-update
updating Eigen to 3.4.0
2024-02-28 18:03:37 -05:00
ee1b8bbdbd Merge pull request #454 from edbennett/adjoint-broke
fix HMC for non-fundamental representations
2024-02-28 14:05:27 -05:00
3f1636637d Merge pull request #453 from dbollweg/feature/sliceSum_gpu
Feature/slice sum gpu
2024-02-28 14:04:43 -05:00
2e570f5300 Merge pull request #457 from lehner/feature/gpt
Import GPT-related updates
2024-02-28 13:59:04 -05:00
9f89486df5 remove unnecessary code path 2024-02-28 19:56:23 +01:00
22b43b86cb Make GPT test suite work with SYCL 2024-02-28 12:57:17 +01:00
3c9012676a CUDA cub refuses to reduce vSpinColourMatrix, breaking up into smaller parts like already done for HIP case. 2024-02-27 12:41:45 -05:00
ee3b3c4c56 relocate deflation support 2024-02-27 11:52:23 -05:00
462d706a63 Move to a blas directory 2024-02-27 11:51:04 -05:00
ee0d460c8e Blas based block project & deflate for multiRHS 2024-02-27 11:41:44 -05:00
cd15abe9d1 Mrhs prep 2024-02-27 11:41:13 -05:00
9f40467e24 Warning squash 2024-02-27 11:40:36 -05:00
d0b6593823 More verbose on checksum 2024-02-27 11:40:14 -05:00
79fc821d8d reorg headers 2024-02-27 11:39:37 -05:00
d7fdb9a7e6 Reorg headers 2024-02-27 11:39:06 -05:00
b74de51c18 Reorder headers 2024-02-27 11:38:52 -05:00
b507fe209c Added SpinColourMatrix case to sliceSum Test 2024-02-27 11:28:32 -05:00
6cd2d8fcd5 Replace cuda/hip memcpy with Grid functions 2024-02-26 09:55:07 -05:00
b02d022993 fixed race condition (thx michael) 2024-02-23 17:14:28 -07:00
94581e3c7a accelerator_for is broken 2024-02-23 15:58:33 -07:00
88b52cc045 Merge branch 'develop' into hisq_fat_links 2024-02-23 14:47:15 -07:00
0a816b5509 Merge branch 'feature/sliceSum_gpu' of https://github.com/dbollweg/Grid into feature/sliceSum_gpu 2024-02-22 21:43:06 -05:00
1c8b807c2e free malloc'd memory 2024-02-22 21:42:44 -05:00
44b466e072 Make InsertSliceFast the default at some point in future.
Should I do this now?
2024-02-21 14:51:24 -05:00
5e5b471bb2 Put/Get and DEviceToDevice 2024-02-21 14:47:06 -05:00
9c2565f64e Working and faster version 2024-02-21 14:46:43 -05:00
e1d0a7cec3 Batched blas 2024-02-21 14:38:20 -05:00
b19ae8f465 Nbasis method for convenience 2024-02-21 14:36:19 -05:00
cdff2c8e18 Updated mrhs adef 2024-02-21 14:27:19 -05:00
66391f84f2 Merge branch 'feature/gpt' of ../Grid into develop 2024-02-21 19:05:00 +01:00
97f7a9ecb3 fix HMC for non-fundamental representations 2024-02-21 08:27:55 +00:00
15878f7613 sliceSumReduction_cub_large now also faster than CPU on Frontier 2024-02-16 13:55:21 -05:00
e0d5e3c6c7 Merge branch 'paboyle:develop' into feature/sliceSum_gpu 2024-02-16 13:16:37 -05:00
6f3455900e Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs 2024-02-16 13:15:02 -05:00
56827d6ad6 accelerator_inline bug 2024-02-14 13:56:57 -07:00
73c0b29535 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-02-13 20:19:32 +00:00
303b83cdb8 Scaling benchmarks, verbosity and MPICH aware in acceleratorInit()
For some reason Dirichlet benchmark fails on several nodes; need to
debug this.
2024-02-13 19:48:03 +00:00
5ef4da3f29 Silence verbose 2024-02-13 19:47:36 +00:00
1502860004 Benchmark scripts 2024-02-13 19:47:02 +00:00
585efc6f3f More benchmark scripts 2024-02-13 19:40:49 +00:00
62055e04dd missing semicolon generates error with some compilers 2024-02-13 18:18:27 +01:00
e4a641b64e removing old Eigen tensor patch 2024-02-13 10:37:14 +01:00
8849f187f1 updating Eigen to 3.4.0 2024-02-13 10:30:22 +01:00
db420525b3 fix Simd::Nsimd typo 2024-02-12 15:03:53 -07:00
b5659d106e more test cases 2024-02-09 13:37:14 -05:00
4b43307402 Undo include path changes for level zero api header 2024-02-09 13:07:56 -05:00
09af8c25a2 Merge branch 'paboyle:develop' into feature/sliceSum_gpu 2024-02-09 13:02:59 -05:00
9514035b87 refactor slicesum: slicesum uses GPU version by default now 2024-02-09 13:02:28 -05:00
2da09ae99b acceleration compiles and doesn't break scalar mode 2024-02-06 18:40:13 -07:00
a38fb0e04a first effort toward accelerators 2024-02-06 18:24:55 -07:00
7019916294 RNG seed change safer for large volumes; this is a long term solution 2024-02-07 00:56:39 +00:00
1514b4f137 slicesum_sycl passes test 2024-02-06 19:08:44 -05:00
91cf5ee312 Updated bench script 2024-02-06 23:45:10 +00:00
0a6e2f42c5 small amount of cleanup 2024-02-06 16:32:07 -07:00
ab2de131bd work towards sliceSum for sycl backend 2024-02-06 13:24:45 -05:00
5bfa88be85 Aurora MPI standalone benchmake and options that work well 2024-02-06 16:28:40 +00:00
5af8da76d7 Fix cuda compilation of Lattice_slicesum_gpu.h 2024-02-01 18:02:30 -05:00
b8b9dc952d Async memcpy's and cleanup 2024-02-01 17:55:35 -05:00
79a6ed32d8 Use accelerator_for2d and DeviceSegmentedRecude to avoid kernel launch latencies 2024-02-01 16:41:03 -05:00
caa5f97723 Add sliceSum gpu using cub/hipcub 2024-01-31 16:50:06 -05:00
4924b3209e projectU3 yields a unitary matrix 2024-01-23 14:43:58 -07:00
eb702f581b Running on 12 rhs on 18 nodes of frontier 2024-01-22 17:44:15 -05:00
3d13fd56c5 Precompute phases, save memory in hermitian 2024-01-22 17:43:35 -05:00
6f51b49ef8 Use stderr 2024-01-22 17:41:09 -05:00
addc638856 Fast localCopyRegion, blockProjectFast 2024-01-22 17:40:38 -05:00
00f24f8765 already found some bugs in projection, still needs testing 2024-01-22 05:50:16 -07:00
f5b3d582b0 first attempt at U3 projection 2024-01-22 02:49:40 -07:00
981c93d67a update Test_fatLinks to accept Naik 2024-01-21 21:09:19 -07:00
c020b78e02 Merge branch 'develop' into hisq_fat_links 2024-01-21 20:21:08 -07:00
42ae36bc28 WOrking 2024-01-17 16:39:14 -05:00
c69f73ff9f Working 2024-01-17 16:38:46 -05:00
ca5ae8a2e6 Revert to working. 2024-01-17 16:32:05 -05:00
d967eb53de Working for first time 2024-01-17 16:31:12 -05:00
839f9f1bbe Don't log memory by default 2024-01-17 16:25:50 -05:00
b754a152c6 Flag guard correctly 2024-01-17 16:25:28 -05:00
e07cb2b9de Accelerator memory 2024-01-17 16:24:31 -05:00
a1f8bbb078 accelerator memory print 2024-01-17 16:24:09 -05:00
7909683f3b MultiRHS 2024-01-17 16:21:07 -05:00
25f71913b7 MultiRHS coarse 2024-01-04 12:01:17 -05:00
34ddd2b7b1 MultiRHS coarse space 2024-01-04 12:00:53 -05:00
d5fd90b2f3 Add 48^3 rtest 2024-01-04 12:00:01 -05:00
b7c7000d0d Don't need the numerical rounding tolerance in multigrid 2023-12-22 18:10:23 -05:00
551f6c4edd Synchronise changes 2023-12-22 18:09:11 -05:00
defd814750 Speed up the coarsened matrix matrix evaluation.
It is block project limited.
Could be sped up with calls to Batched GEMM and a data layout change.
2023-12-22 18:07:03 -05:00
3d517bbd2a Synchronise decouple from the launch
Speeds up multileg stencils
2023-12-22 18:06:13 -05:00
78ab955fec Better padded cell exchange 2023-12-22 18:05:41 -05:00
dd13937bb6 Better opt face gather scatter 2023-12-22 18:03:38 -05:00
66a1b63aa9 Faster grid/blas layout change.
Halo exchange is now the only slow part.
Revisit
2023-12-21 20:50:18 -05:00
22c611bd1a Delete temp file 2023-12-21 18:32:31 -05:00
c9bb1bf8ea Passing new BLAs based 2023-12-21 18:31:17 -05:00
2a0d75bac2 Aurora files 2023-12-21 23:20:17 +00:00
9e489887cf General coarse multiRHS move to BLAS implementation 2023-12-21 15:24:48 -05:00
9feb801bb9 Much simpler GPU implementation 2023-12-21 15:24:06 -05:00
c00b495933 Multigrid 2023-12-21 15:23:31 -05:00
d22eebe553 BLas options 2023-12-21 15:23:03 -05:00
8bcbd82680 BLAS based layout and implementation 2023-12-21 15:21:24 -05:00
dfa617c439 Batched SGEMM/DGEMM/ZGEMM/CGEMM
Hip, Cuda version and vanilla CPU
One MKL stub in comments, to be tested as different.
2023-12-21 14:01:18 -05:00
48d1f0df89 Optimised partially, working 2023-12-21 12:33:47 -05:00
b75cb7a12c Blas batched partial implementation on Frontier only for now 2023-12-21 12:31:33 -05:00
332563e037 Debugged, reducing verbose 2023-12-21 12:30:57 -05:00
0cce97a4fe verbosity only 2023-12-20 21:30:10 -05:00
95a8e4be64 rocblas 2023-12-20 21:27:59 -05:00
abcd6b8cb6 Faster version 2023-12-19 15:17:46 -05:00
e8f21c9b6d Memmory verbose control improvement 2023-12-19 15:16:58 -05:00
f48298ad4e Bug fix 2023-12-11 20:57:02 -05:00
645e47c1ba Config for Ampere Altra ARM 2023-12-08 16:17:56 -05:00
d1d9827263 Integrator logging update 2023-12-08 12:14:00 -05:00
e054078b11 Verbose 2023-12-05 16:15:17 -05:00
14643c0aab SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512) 2023-12-04 15:45:57 -05:00
b77a9b8947 SDDC compiles starting 2023-11-30 14:31:51 -05:00
6835a7f208 Better logging, test on 81 point stencil 2023-11-29 19:20:47 -05:00
f59993b979 Nbasis§ 2023-11-29 09:47:36 -05:00
2290b8f680 Verbose 2023-11-29 09:47:04 -05:00
2c54be651c Further updates 2023-11-29 09:43:29 -05:00
e859a199df Reduce volume to interior for coarse stencil -- worth up to 4x gain 2023-11-28 10:23:16 -05:00
0a3682ad0b MultiRHS work 2023-11-28 07:43:37 -05:00
59abaeb5cd Time stamp 2023-11-24 12:56:45 -05:00
3e448435d3 Restrict to interior 2023-11-23 18:23:29 -05:00
a294bc3c5b Relax constraints for multiRHS 2023-11-23 18:20:42 -05:00
b302ad3d49 multiRHS test in place, passes Yay! 2023-11-23 18:20:15 -05:00
82fc4b1e94 Finalise 2023-11-23 18:19:41 -05:00
b4f1740380 Finalise message 2023-11-23 18:19:16 -05:00
031f85247c multRHS initial support -- needs optimisation for multi project/promote.
Bug fix in freeing intermediate grids to stop double free
2023-11-23 18:18:35 -05:00
639cc6f73a better support for multiRHS coarse space
Still to add restriction of domain of last loop to interior of padded cell (expect about 4.5x on test volume on Crusher)
2023-11-23 18:16:26 -05:00
09946cf1ba Improved, works on 48^3 moving to multiRHS optimisations 2023-11-15 18:03:05 -05:00
f4fa95e7cb Use 5.3.0 2023-11-15 18:01:38 -05:00
100e29e35e Allow expression as argument to norm2 2023-11-15 18:00:44 -05:00
4cbe471a83 devVector 2023-11-15 18:00:07 -05:00
8bece1f861 Faster to transpose the matrix and apply with column major order 2023-11-15 17:58:38 -05:00
a3ca71ec01 Lots more setup options, still working on them 2023-11-15 17:58:04 -05:00
e0543e8af5 Implement flexible preconditioned CG 2023-11-15 17:57:39 -05:00
c1eb80d01a Print which have converged 2023-11-15 17:57:08 -05:00
a26121d97b Better printing 2023-11-15 17:56:45 -05:00
043031a757 Report resid on failed convergence 2023-11-15 17:56:22 -05:00
807aeebe4c Resize tol in constructor 2023-11-15 17:55:57 -05:00
8aa1a37aad For Mirs preconditioner solver 2023-11-15 17:55:32 -05:00
7d077fe493 Frontier compiel 2023-11-09 13:58:44 -05:00
9cd4128833 fix naik bug 2023-11-03 14:11:38 -06:00
c8b17c9526 Naik to CShift 2023-11-02 12:43:22 -06:00
2ae2a81e85 attempt to fix Naik 2023-10-31 13:54:55 -06:00
69c869d345 fixed stupid typo 2023-10-30 17:41:52 -06:00
df9b958c40 naik now returns separately 2023-10-30 17:40:53 -06:00
3d3376d1a3 LePage works, trying Naik 2023-10-27 16:26:31 -06:00
4efa042f50 C++17 change 2023-10-24 10:57:50 -04:00
c7cb37e970 c++17 accepted 2023-10-24 10:57:24 -04:00
d34b207eab Avoid HIP warnings 2023-10-24 10:57:04 -04:00
0e6fa6f6b8 DOn't need the Cshift for the period optimisation 2023-10-24 10:56:31 -04:00
38b87de53f This works around a stacksize limit on AMD GPU 2023-10-24 10:56:07 -04:00
aa5047a9e4 Faster blockProject blockPromote 2023-10-24 10:49:55 -04:00
24b6ee0df9 M4 file 2023-10-24 10:36:48 -04:00
1e79cc9cbe Avoid compiler error 2023-10-24 10:36:09 -04:00
b3925df9c3 Verbose on CPU-GPU xfer, remove performance by default 2023-10-24 10:25:01 -04:00
f2648e94b9 getHostPointer added to Lattice 2023-10-23 13:47:41 +02:00
351795ac3a Better messaging 2023-10-20 19:33:04 -04:00
9c9c42d0df Tests on frontier with real speed up . 3.5x on 16^3 at mq=0.01 2023-10-20 19:27:13 -04:00
b6ad1bafc7 Normal memory SendToRecvFrom asynchronous for use in general stencil
code
2023-10-20 19:27:13 -04:00
a5ca40f446 Better verbose -- track CPU GPU motion under --log Memory, others go to
debug output stream
2023-10-20 19:27:13 -04:00
9ab54c5565 Overlap comms & data copy/buffer assembly in Ghost zone exchange 2023-10-20 19:27:13 -04:00
4341d96bde Massively sped up coarse grid mult, comms
Save 3ms spend (60% of time !) on cudaMalloc !!
2023-10-20 19:27:13 -04:00
5fac47a26d Faster halo exchange 2023-10-20 19:27:13 -04:00
e064f17346 Faster halo exchange 2023-10-20 19:27:13 -04:00
afe10ba2a2 More digits 2023-10-20 19:27:13 -04:00
7cc3435ba8 Imporved General coarsened matrix 2023-10-20 19:27:13 -04:00
541772313c Verbosity 2023-10-20 19:27:13 -04:00
3747494a09 Notify delet public 2023-10-20 19:27:13 -04:00
f2b98d0dcc Const safety 2023-10-20 19:27:13 -04:00
80471bf762 Alternate implementation involving face operations 2023-10-20 19:27:13 -04:00
a06f63c110 Improved I/O and non-lexico option exposed to SciDAC format 2023-10-20 19:27:13 -04:00
0ae4478cd9 Checkpoint the subspace and ldop 2023-10-20 19:27:13 -04:00
ae4e705e09 Use random vec as easier for debug 2023-10-20 19:27:13 -04:00
f5dcea9dbf Updates for Frontier 2023-10-20 19:27:12 -04:00
21ed6ac0f4 added floating-point support 2023-10-20 13:54:26 -06:00
7bb8ab7000 improve smearing templating 2023-10-20 08:41:02 -06:00
2c824c2641 Merge branch 'develop' into hisq_fat_links 2023-10-17 16:03:59 -06:00
391fd9cc6a try lepage term 2023-10-17 14:57:15 -06:00
2207309f8a Spack rules 2023-10-16 18:38:24 -04:00
51051df62c 3GeV run setup 2023-10-16 20:49:52 +03:00
33097681b9 FTHMC compiled and merged to develop 2023-10-14 00:42:55 +03:00
07e4900218 FTHMC commit 2023-10-13 18:21:57 +03:00
36ab567d67 FTHMC 3 Gev 2023-10-13 18:21:57 +03:00
e19171523b FTHMC Status at lattice conference commit 2023-10-13 18:21:56 +03:00
9626a2c7c0 Asynch handling 2023-10-13 18:21:56 +03:00
e936f5b80b IfGridTensor shorthand 2023-10-13 18:21:56 +03:00
ffc0639cb9 Running in HMC tests 2023-10-13 18:21:56 +03:00
c5b43b322c traceProduct eliminates non-contributing intermediate terms 2023-10-13 18:21:56 +03:00
c9c4576237 Improved frontier cshift 2023-10-13 18:21:56 +03:00
bf4369f72d clean up HISQSmear with decltypes 2023-10-12 12:41:06 -06:00
36600899e2 working 7-link; Grid_log; generalShift 2023-10-12 11:11:39 -06:00
b9c70d156b Merge branch 'develop' into hisq_fat_links 2023-10-10 22:44:17 -06:00
eb89579fe7 Merge remote-tracking branch 'origin/develop' into develop 2023-10-10 22:43:51 -06:00
0cfd13d18b 7-link working 2023-10-10 22:41:52 -06:00
e6ed516052 merged 2023-10-08 09:00:37 +02:00
e2a3dae1f2 Option for multiple simultaneous CartesianStencils 2023-10-08 08:58:44 +02:00
2111e7ab5f Run at physical mass 2023-10-06 21:20:21 -04:00
d29abfdcaf Transfer code to Frontier now 2023-10-06 21:03:34 -04:00
a751c42cc5 Checkpoint restore the setup 2023-10-06 21:03:08 -04:00
6a3bc9865e Verbose change 2023-10-06 21:02:04 -04:00
4d5f7e4377 Verbose change 2023-10-06 21:01:37 -04:00
78b117fb78 Comment fix 2023-10-06 21:01:15 -04:00
ded63a1319 Verbose change/pretty print 2023-10-06 21:00:53 -04:00
df3e4d1e9c Return fix 2023-10-06 21:00:21 -04:00
b58fd80379 I/O for coarse op and reorganise multigrid headers 2023-10-06 13:43:46 -04:00
7f6e0f57d0 No IO in file 2023-10-06 13:39:53 -04:00
cae27678d8 gpermute 2023-10-06 13:39:19 -04:00
48ff655bad Slightly less verbose 2023-10-06 10:47:52 -04:00
2525ad4623 Slight clean up 2023-10-06 10:47:32 -04:00
e7020017c5 Reorganise multigrid 2023-10-06 10:47:12 -04:00
eacebfad74 Reorganise multigrid into multiple headers 2023-10-06 10:46:21 -04:00
3bc2da5321 Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1 2023-10-05 16:57:59 -04:00
2d710d6bfd Optimised parameters for 16^3 2023-10-05 16:56:55 -04:00
6532b7f32b Eliminate older inefficient coarsening implementation 2023-10-05 16:56:15 -04:00
7b41b92d99 Only need to bad non-local dimensions 2023-10-05 16:55:48 -04:00
dd557af84b ADEF1 and ADEF2 2 level CG 2023-10-05 16:55:19 -04:00
59b9d0e030 coalesceRead the blockSum 2023-10-05 16:54:48 -04:00
b82eee4733 Hermitian dealing with 2023-10-05 16:54:14 -04:00
6a87487544 Running on Frontier, fix RNG big volume y2k, affecting 5D RNG 2023-10-05 16:50:59 -04:00
fcf5023845 Running on Frontier 2023-10-05 16:50:59 -04:00
c8adad6d8b First runs on Summit. PopulateAdag needs work 2023-10-05 16:50:54 -04:00
737d3ffb98 ADEF1 and 1 hop projection 2023-10-03 14:22:18 -04:00
6d0c2de399 Deprecate teh PVC directory and make a PVC-OEM generic PVC target with
no queueing system dependency -- just interactive scripts
2023-10-03 17:04:20 +00:00
7786ea9921 Bug fix in script 2023-10-03 09:58:44 -07:00
d93eac7b1c Performance regressed and is OK in icpx 2023.2 2023-10-03 15:53:14 +00:00
b01e67bab1 coalescedReadGeneralPermute now working 2023-10-02 17:46:57 -04:00
8a70314f54 Merge branch 'develop' into feature/scidac-wp1 2023-10-02 17:24:55 -04:00
afc316f501 Rename headers 2023-10-02 16:25:11 -04:00
f14bfd5c1b Relocate sub includes 2023-10-02 16:23:38 -04:00
c5f1420dea Merge remote-tracking branch 'LupoA/develop' into LupoA-develop 2023-10-02 16:22:35 -04:00
018e6da872 Merge pull request #440 from giltirn/feature/paddedcellgauge
Feature/paddedcellgauge
2023-10-02 10:00:42 -04:00
b77bccfac2 Merge pull request #444 from mmphys/feature/docX
Update doc complete list of Macports needed to build Grid on a fresh Mac
2023-10-02 09:57:11 -04:00
36ae6e5aba Fastest GPU version.
Need to work on the PaddedCell now to make much faster
2023-09-29 18:26:51 -04:00
9db585cfeb Temporary commit while optimisation is carried out 2023-09-29 17:11:35 -04:00
c564611ba7 Annoying hack that is useful to preserve for profiling 2023-09-29 17:11:12 -04:00
e187bcb85c Updating 2023-09-29 17:10:17 -04:00
be18ffe3b4 Further tuning and lanczos 2023-09-27 16:21:58 -04:00
0d63dce4e2 Timing info 2023-09-27 16:21:14 -04:00
26b30e1551 Flop count and projection to nearest neighbour (keeps redundant flops) 2023-09-27 16:20:11 -04:00
7fc58ac293 Verbose subspace init 2023-09-27 16:19:45 -04:00
3a86cce8c1 Compile 2023-09-27 16:19:18 -04:00
80359e0d49 Bland SYCL compile 2023-09-26 13:20:27 -07:00
3d437c5cc4 Making SYCL happy 2023-09-26 13:19:42 -07:00
37884d369f Coarse space is expensive, but gives a speed up in fine matrix multiplies now.
Down to optimisation
2023-09-25 17:24:19 -04:00
9246e653cd Basic non-local coarsening of operator test 2023-09-25 17:20:58 -04:00
64283c8673 Normal equations becomes linear function for easy base class pass aroudn 2023-09-25 17:19:39 -04:00
755002da9c Comparison convenience 2023-09-25 17:16:33 -04:00
31b8e8b437 Better messaging 2023-09-25 17:16:14 -04:00
0ec0de97e6 Adef2 implemented and working in an HDCG like context 2023-09-25 17:15:03 -04:00
6c3ade5d89 Improved the coarsening 2023-09-25 17:14:40 -04:00
980c5f9a34 Update chebyshev setup 2023-09-25 17:12:22 -04:00
63d9b8e8a3 Merge remote-tracking branch 'origin/develop' into hisq_fat_links 2023-09-16 23:20:31 -06:00
d247031c98 try 7-link 2023-09-16 23:18:16 -06:00
e29b97b3ea Qslash term added 2023-09-14 16:14:03 -04:00
ad2b699d2b Better macos 2023-09-14 16:12:21 -04:00
471ca5f281 Power method more iterations 2023-09-07 10:55:05 -04:00
e82ddcff5d Working getting closer to HDCG but some low level engineering work still needed
+ MUCH work on optimisation
2023-09-07 10:53:51 -04:00
b9dcad89e8 Test cases for coarsening with non-local stencil 2023-09-07 10:53:22 -04:00
993f43ef4a Even odd use case 2023-09-07 10:53:06 -04:00
2b43308208 First cut non-local coarsening 2023-08-25 17:38:07 -04:00
04a1ac3a76 First cut for non-local coarsening 2023-08-25 17:37:38 -04:00
990b8798bd Merge remote-tracking branch 'refs/remotes/origin/develop' into develop 2023-08-25 17:36:45 -04:00
b334a73a44 Stencil improvement 2023-08-25 17:35:10 -04:00
5d113d1c70 Odd address sanitizer complain 2023-08-25 17:34:18 -04:00
c14977aeab Random vector option for test purposes 2023-08-25 17:33:31 -04:00
3e94838204 Spread out improvement 2023-08-25 17:31:28 -04:00
c0a0b8ca62 NEON and address sanitiser 2023-08-25 17:30:30 -04:00
b8a7004365 Partial fraction test 2023-08-14 15:17:03 -04:00
affff3865f Merge branch 'develop' into hisq_fat_links 2023-08-11 23:08:04 -06:00
9c22655b5a Merge remote-tracking branch 'origin/develop' into develop 2023-08-11 23:06:42 -06:00
99d879ea7f 5-link first attempt 2023-08-11 22:56:30 -06:00
bd56c95a6f Update documentation with complete list of Macports needed to build Grid on a fresh Mac 2023-07-14 13:50:06 +01:00
994512048e Merge pull request #439 from felixerben/bugfix/IRL_convergence
Bugfix/irl convergence
2023-07-12 16:32:26 -04:00
dbd8bb49dc Merge pull request #32 from LupoA/sp2n/develop
Sp2n/develop
2023-07-04 15:23:43 +00:00
3a29af0ce4 Fixed linker error 2023-07-04 16:08:44 +01:00
f7b79cdd45 Added test for ProjectSpn 2023-07-03 18:00:32 +01:00
075b9d22d0 adjoint rep implemented as 2indx symmetric 2023-07-02 13:58:31 +01:00
b92428f05f better test 2023-07-02 13:34:03 +01:00
34b11864b6 prettiest tests 2023-07-02 13:25:57 +01:00
1dfaa08afb The stencils for the staple and rect-staple padded cell implementations are now created and stored by workspace classes that allow for reuse providing the grids remain consistent
The workspaces are now used by the plaq+rectangle gauge action resulting in a further 2x performance improvement as measured on a 16^4 local volume for 2 nodes (16 ranks) of Crusher
2023-06-28 15:11:24 -04:00
9d263d9a7d fix bug in HISQSmearing; move benchmark b/c i don't understand how makefiles work 2023-06-28 10:05:34 -06:00
9015c229dc add benchmark to see whether matrix multiplication is slower than read from object 2023-06-27 21:28:26 -06:00
f44dce390f Implemented acclerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding
Fixed const correctness on PaddedCell methods
Fixed compile issues on Crusher
Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance
Optimized kernel for StaplePadded
Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings
2023-06-27 14:58:10 -04:00
bb71e9a96a Added PaddedCell and GeneralisedLocalStencil header includes to standard base headers
Moved versions of the padded-cell implementations of staple and rect-staple from test code to WilsonLoops header
Added StapleAndRectStapleAll which is now called by the plaq+rectangle action class. Under the hood it uses the padded cell implementations with maximal reuse of the padded gauge links
2023-06-27 11:23:30 -04:00
78bae9417c returning Nstop vectors even if not all meet true convergence criterion 2023-06-27 14:38:19 +01:00
dd170ead01 whitespace 2023-06-27 11:37:01 +01:00
014704856f do one more iteration if not all vectors converged 2023-06-27 11:33:30 +01:00
a7eabaad56 rudimentary appendShift convenience method, which allows the user to append an arbitrary shift in one line 2023-06-26 23:59:28 -06:00
eeb4703b84 develop wrappers to make the stencils easier to construct 2023-06-26 17:45:35 -06:00
a07421b3d3 Merge branch 'develop' into hisq_fat_links 2023-06-26 13:51:32 -06:00
cda53b4068 Merge remote-tracking branch 'origin/develop' into develop 2023-06-26 13:51:06 -06:00
6f6844ccf1 Added new StapleAll and RectStapleAll functions that return the staples for all mu as an array
Modified plaq+rectangle gauge actions to use the above
Added a test code to confirm the above changes
2023-06-26 15:48:47 -04:00
4c6613d72c Modified RectStapleDouble and RectStapleOptimised to use Gauge-BC respecting CshiftLink
Added test code tests/debug/Test_optimized_staple_gaugebc demonstrating equivalence of above to RectStapleUnoptimised for cconj gauge BCs
Removed optimized staple only being used for periodic gauge BCs; it is now always used
2023-06-26 10:20:23 -04:00
ee92e08edb Merge pull request #435 from fjosw/fix/warnings_in_WilsonKernelsImplementation
Unused variable in WilsonKernelsImplementation
2023-06-23 11:47:19 -04:00
c1dcee9328 Merge pull request #437 from fjosw/fix/stencil_debug
Added GridLogDebug to BuildSurfaceList debug message
2023-06-23 11:47:00 -04:00
559257bbe9 better documentation and filelist names 2023-06-23 16:16:48 +01:00
6b150961fe Better script 2023-06-23 18:09:25 +03:00
cff1f8d3b8 rm unused variables and formatting 2023-06-23 16:04:18 +01:00
f27d2083cd adjustments in SUn and Sp2n impl 2023-06-23 15:34:08 +01:00
36cc9c524f Threaded the constructor of GeneralLocalStencil 2023-06-23 09:57:38 -04:00
2822487450 rm unncessary line 2023-06-23 14:55:23 +01:00
e07fafe46a minor adjustments to twoindex 2023-06-23 12:18:04 +01:00
063d290bd8 missing function 2023-06-23 11:11:20 +01:00
4e6194d92a Avoid code duplication in ProjectSUn 2023-06-23 11:03:50 +01:00
de30c4e22a minor improvements 2023-06-23 10:49:41 +01:00
df99f227c1 include missing staple orientations; invert path direction, which was backwards 2023-06-22 14:57:10 -06:00
5bafcaedfa Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-06-22 19:59:45 +03:00
bfeceae708 FTHMC 2023-06-22 12:58:18 -04:00
eacb66591f Config command 2023-06-22 19:56:40 +03:00
fadaa85626 Update 2023-06-22 19:56:27 +03:00
02a5b0d786 Updating run during testing 2023-06-22 19:52:46 +03:00
0e2141442a Dennis says broken 2023-06-22 19:19:51 +03:00
769eb0eecb Precision coverage 2023-06-22 19:19:20 +03:00
4241c7d4a3 Imported coalescedReadGeneralPermute GPU implementation from Christoph
Fixed bug in padded staple code where extract was being called on the result before the GPU view was closed
Fixed compile issue with pointer cast in padded staple code
Added timing summaries of padded staple code and timing breakdown of staple implementation to Test_padded_cell_staple
2023-06-21 16:01:01 -04:00
d536c67b9d add HISQSmearing to Smearing.h 2023-06-20 16:04:48 -06:00
f44f005dad rename _lvl1 --> _linkTreatment 2023-06-20 15:48:27 -06:00
26b2caf570 add template parameter to Smear_HISQ_fat for MILC interfacing 2023-06-20 15:37:54 -06:00
7b11075102 The user can now specify the implementation of Cshift used by the PaddedCell class through a virtual base class API. Implementations for default (regular Cshift) and for gauge links (which respects the gauge BCs)
Fixed const-correctness for PaddedCell and ConjugateGimpl::setDirections
Modified test code for padded-cell implementation of staple, rect-staple to use cconj BCs
2023-06-20 17:09:56 -04:00
abc658dca5 Added coalescedReadGeneralPermute CPU implementation based on Christoph's GPT code
In a test code, implemented a padded-cell version of the staple and rectangular-staple calculation
2023-06-20 16:14:25 -04:00
8bb078db25 Merge branch 'develop' into hisq_fat_links 2023-06-20 13:05:00 -06:00
b61ba40023 Merge remote-tracking branch 'origin/develop' into develop 2023-06-20 13:04:53 -06:00
452bf2e907 Accelerator basisRotate also on HIP 2023-06-20 20:36:24 +03:00
2372275b2c Merge pull request #36 from LupoA/sp2n/gpu-bugfix
Sp2n/gpu bugfix [close #30]
2023-06-20 13:46:00 +01:00
ef736e8aa4 Merge pull request #35 from LupoA/sp2n/enableSp
consistent enable sp config flag
2023-06-20 10:41:09 +00:00
5e539e2d54 Forgot some follow-ups on changed signature 2023-06-18 12:37:51 +01:00
96773f5254 Apparently forgot to remove one Lattice version 2023-06-18 12:21:39 +01:00
d80df09f3b consistent enable sp config flag 2023-06-16 19:16:46 +01:00
621e612c30 Fix non-zero ret on device bug 2023-06-16 16:27:49 +01:00
8c3792721b ClangFormat 2023-06-16 15:58:23 +01:00
c95bbd3948 Remove accelerated lattice version 2023-06-16 15:50:26 +01:00
e28ab7a732 Re-included instantiations for symmetric 2Index AS Sp 2023-06-16 14:20:37 +01:00
c797cbe737 deal with post-merge trauma 2023-06-16 14:20:37 +01:00
e09dfbf1c2 definetely the right merge upstream/develop 2023-06-16 14:19:46 +01:00
85e35c4da1 fix: added GridLogDebug to BuildSurfaceList debug message. 2023-06-16 10:31:16 +01:00
d72e914cf0 Profiling temporary code until optimised 2023-06-15 10:43:04 -04:00
3b5254e2d5 Optional checkpoint smeared configs for FTHMC 2023-06-15 10:43:04 -04:00
f1c358b596 Additional tests 2023-06-15 10:43:04 -04:00
c0ef210265 Hot start should be properly Hot 2023-06-15 10:43:04 -04:00
e3e1cc1962 Ta project 2023-06-15 10:43:04 -04:00
723eadbb5c Keep methods virtual 2023-06-15 10:43:04 -04:00
e24637ec1e Clean up 2023-06-15 10:43:04 -04:00
8b01ff4ce7 Integrator over to smeared force structure 2023-06-15 10:43:04 -04:00
588197c487 Smeared action virtual class 2023-06-15 10:43:04 -04:00
116d90b0ee First attempt on #30 2023-06-15 15:09:37 +01:00
b0646ca187 Remove some unused variables 2023-06-15 15:09:09 +01:00
1352bad2e4 Sunspot compile 2023-06-15 11:22:46 +00:00
14d352ea4f added smearParams struct 2023-06-12 16:55:44 -06:00
1cf9ec1cce now compiles 2023-06-09 16:27:45 -06:00
4895ff260e Merge pull request #28 from LupoA/sp2n/config
compile sp2n fermion impl only if declared at config time
2023-06-09 13:07:48 +00:00
4b994a1bc7 trouble with compilation 2023-06-08 17:37:25 -06:00
e506d6d369 Merge branch 'develop' into hisq_fat_links 2023-06-07 21:16:20 -06:00
ab56ad8d7a fix 3-link stencil 2023-06-07 21:14:58 -06:00
470d93006a compile sp2n fermion impl only if declared at config time 2023-06-07 12:53:33 +01:00
2f3d03f188 Merge pull request #27 from LupoA/sp2n/documentation
documentation for gaugegroup and sp2n
2023-06-01 16:42:27 +00:00
8db7c23bee improve documentation 2023-06-01 17:39:10 +01:00
69dc5172dc Merge pull request #26 from LupoA/sp2n/irreps
Sp2n/irreps
2023-06-01 16:28:15 +00:00
fd72eb6546 Merge branch 'sp2n/algorithm' into sp2n/irreps 2023-06-01 17:24:01 +01:00
ffd7301649 Updated masked / fthmc smeared config container 2023-06-01 06:23:02 -04:00
d2a8494044 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-06-01 06:22:33 -04:00
0982e0d19b Jacobian action wrapper for FTHMC 2023-06-01 06:15:08 -04:00
3badbfc3c1 Refactor the Action and Smeared gauge configuration containers. Add first pass at FTHMC action 2023-06-01 06:14:28 -04:00
5465961e30 New test for FTHMC portion 2023-06-01 06:14:04 -04:00
477b794bc5 fix: unused variable removed. 2023-05-29 14:08:53 +01:00
e8c29e2fe5 Merge pull request #31 from paboyle/develop
Sync
2023-05-28 16:13:12 +02:00
4835fd1a87 HIP stream synch 2023-05-27 17:58:22 +03:00
6533c25814 Lumi 2023-05-27 16:13:32 +03:00
b405767569 make private methods private 2023-05-26 17:02:16 +01:00
fe88a0c12f cleaner twoindex class, cleaner tests 2023-05-26 16:55:30 +01:00
e61a9ed2b4 partial revert 2023-05-26 13:54:26 +01:00
de8daa3824 group is SUn by default 2023-05-26 13:44:41 +01:00
3a50fb29cb directly call sp helper 2023-05-26 13:28:47 +01:00
6647d2656f rm unnecessary specialisation 2023-05-26 12:27:22 +01:00
a6f4dbeb6d remove redundant template parameter 2023-05-26 12:13:40 +01:00
92a282f2d8 Merge pull request #24 from LupoA/sp2n/fix_static_assert_symmetric
Move static_assert inside of function
2023-05-26 11:13:50 +01:00
ca2fd9fc7b documentation for gaugegroup and sp2n 2023-05-25 18:40:54 +01:00
3825329f8e Merge branch 'develop' into hisq_fat_links 2023-05-24 15:37:25 -06:00
be1a4f5860 implement TwoIndexSymm for sp2n 2023-05-22 17:21:03 +01:00
1b2914ec09 FT-HMC smearing, derivative chain rule, log det and force first pass. 2023-05-22 10:21:37 -04:00
519f795066 Header not liked by gcc on mac? puzzling 2023-05-22 10:21:12 -04:00
5897b93dd4 debug tests, fix dimension 2023-05-22 13:42:21 +01:00
af091e0881 DimensionHelper for 2index irreps 2023-05-21 16:56:06 +01:00
3c1e5e9517 Merge pull request #25 from LupoA/sp2n/unify_representations
Sp2n/unify representations [close #3]
2023-05-21 14:55:27 +01:00
85b2cb7a8a changing some hardcoded SUn lines 2023-05-21 14:50:28 +01:00
c7bdf2c0e4 3-link test at least gives an answer 2023-05-21 04:33:20 -06:00
4240ad5ca8 Preparing for FTHMC 2023-05-19 21:21:55 -04:00
d418347d86 public for convenience to see rho params 2023-05-19 21:21:05 -04:00
29a4bfe5e5 Clean up 2023-05-19 21:20:45 -04:00
9955bf9daf Regresses to Qlat 2023-05-19 17:32:13 -04:00
da9cbfc7cc Suppress BuildSurfaceList verbosity in Stencil.h 2023-05-19 20:22:20 +02:00
6b9f07c1ed Merge pull request #30 from paboyle/develop
Merge upstream
2023-05-19 20:20:58 +02:00
b8bdc2eefb Unified two index representations 2023-05-18 18:36:29 +01:00
0078826ff1 Move static_assert inside of function 2023-05-18 18:14:53 +01:00
e855c41772 Unified spfundamental.h with fundamental.h 2023-05-18 18:11:20 +01:00
d169c275b6 Merge pull request #22 from LupoA/sp2n/unify_twoindex
Unify TwoIndex
2023-05-18 14:55:02 +00:00
a5125e23f4 Typo 2023-05-18 15:41:35 +01:00
7b83c80757 Merge branch 'sp2n/unify_twoindex' of github.com:LupoA/Grid into sp2n/unify_twoindex 2023-05-18 15:36:14 +01:00
e41821e206 Disable two index symmetric 2023-05-18 15:29:55 +01:00
bf91778550 verbose plaquette example; fat link test frame 2023-05-17 15:15:54 -06:00
5a75ab15a2 typo in 2S dim 2023-05-17 20:47:57 +01:00
932c783fbf 2AS for every Nc! 2023-05-17 20:22:05 +01:00
55f9cce577 Revert "Added automated HMC test for Nc=4"
This reverts commit eee27b8b30.
2023-05-17 09:17:48 +01:00
b3533ca847 correct tests (failing) 2023-05-16 17:43:52 +01:00
fd2a637010 test 2index 2023-05-16 14:10:39 +01:00
eee27b8b30 Added automated HMC test for Nc=4 2023-05-15 18:37:33 +01:00
8522352aa3 ClangFormat 2023-05-15 18:36:05 +01:00
3beb8f4091 fixing typo, getting pre-changes physics 2023-05-15 16:00:15 +01:00
12a706e9b1 de-hardcode the number of generators 2023-05-15 15:48:21 +01:00
170aa7df01 fix (dimension to be improved) 2023-05-15 15:20:18 +01:00
e8ad1fef53 Unify TwoIndex 2023-05-12 14:35:50 +01:00
876c8f4478 Nodes on padded cell 2023-05-11 12:35:49 -04:00
9c8750f261 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-05-11 12:29:09 -04:00
91efd08179 Option for Qlat generator basis 2023-05-11 12:27:45 -04:00
9953511b65 Mac compile 2023-05-11 12:27:29 -04:00
025fa9991a For FTHMC 2023-05-11 12:26:14 -04:00
e8c60c355b Padded cell code 2023-05-11 12:25:50 -04:00
6c9c7f9d85 Permute fix 2023-05-11 12:24:21 -04:00
f534523ede Debug 2023-05-11 12:23:11 -04:00
1b8a834beb Debug 2023-05-11 12:22:24 -04:00
aa9df63a05 rename group projections based on determinants 2023-05-10 14:50:52 +01:00
3953312a93 Merge pull request #20 from LupoA/sp2n/unify_gaugeimpltypes
Sp2n/unify gaugeimpltypes
2023-05-03 15:17:10 +00:00
6e62f4f616 ClangFormat 2023-05-03 16:15:12 +01:00
6a7bdca53b Take over additional algebra tests from Alessandro 2023-05-03 16:02:02 +01:00
c7fba9aace Take over additional group tests from Alessandro 2023-05-03 16:01:48 +01:00
ac6c7cb8d6 Merge in Alessandro's changes [test fails] 2023-05-03 02:53:03 +01:00
c5924833a1 ClangFormat 2023-05-03 02:39:36 +01:00
ac0a74be0d Taken care of algebra tests 2023-05-03 02:32:42 +01:00
42b0e1125d Naming and argument types 2023-05-03 01:51:46 +01:00
339c4fda79 Extracted is_element_of Sp2n 2023-05-02 15:44:34 +01:00
9b85bf9402 better projection test 2023-05-02 15:42:20 +01:00
86b02c3cd8 cleaning up requested by Julian 2023-05-02 13:31:17 +01:00
7b3b7093fa cleaning up requested by Ed 2023-05-02 12:50:57 +01:00
881b08a465 Correct implementation of SpTa 2023-04-27 18:17:06 +01:00
3ee5444c69 Remove commented out stuff 2023-04-21 08:08:18 +01:00
5e28fe56d2 Remove code duplication: Iterating through vectors 2023-04-21 08:08:06 +01:00
3aa43e6065 Debug info 2023-04-20 14:21:13 -04:00
78ac4044ff HMC 2023-04-20 13:28:07 -04:00
119c3db47f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-18 15:13:16 -04:00
21bbdb8fc2 Crusher 2023-04-18 15:11:16 -04:00
5aabe074fe Rename Sympl* to Sp* 2023-04-18 11:50:20 +01:00
739bd7572c Example code 2023-04-17 21:51:55 +00:00
074627a5bd Pass file descriptors through AF_UNIX for level_zero 2023-04-17 21:50:52 +00:00
6a23b2c599 Drop UVM 2023-04-17 21:49:58 +00:00
dace904c10 fix typo 2023-04-14 18:06:18 +01:00
be98d26610 small change I missed in previous commit 2023-04-13 17:48:43 +01:00
bd891fb3f5 tests to compile 2023-04-12 18:32:44 -04:00
3984265851 Merge pull request #432 from paboyle/hotfix/nvcc-warnings
Unused statements generating warnings removed
2023-04-12 16:59:02 -04:00
45361d188f Merge pull request #427 from fjosw/feat/bug_report_issue_template
Feat/bug report issue template
2023-04-12 16:58:41 -04:00
80c9d77e02 Merge pull request #433 from paboyle/hotfix/virtual-dtor
Virtual destructor for LinearOperator
2023-04-12 16:56:18 -04:00
3aff64dddb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-11 12:19:15 -07:00
b4f2ca81ff Copy queue and compute queue same as better concurrency 2023-04-11 12:18:21 -07:00
d1dea5f840 New driver 2023-04-11 12:16:52 -07:00
54f8b84d16 Fence 2023-04-11 12:16:08 -07:00
da503fef0e Name change on barrier routine 2023-04-11 12:14:04 -07:00
4a6802098a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-07 15:43:28 -04:00
f9b41a84d2 Trajectory runs to completion on Crusher within wall clock time 2023-04-07 15:42:45 -04:00
5d7e0d18b9 virtual destructor for LinearOperator 2023-04-07 14:30:38 +01:00
9e64387933 mores unused statements removed 2023-04-07 14:27:18 +01:00
983b681d46 unused statement cleaning 2023-04-07 14:12:02 +01:00
4072408b6f Update README.md 2023-04-07 11:45:28 +01:00
bd76b47fbf Update CI badge in README 2023-04-07 11:44:48 +01:00
5f75735dab Add M and Mdag to WilsonTMFermion 2023-04-06 18:25:05 +02:00
178376f24b minor stylistic changes 2023-04-06 12:08:17 +01:00
18ce23aa75 Fix NEON SIMD 2023-04-06 11:30:48 +01:00
6a0eb466ee Merge pull request #19 from LupoA/refactoring_sp2n
refactoring sp2n
2023-04-05 10:50:58 +00:00
ffa7fe0cc2 Merge branch 'feature/dirichlet' into develop 2023-04-04 23:13:52 -04:00
6b979f0a69 Dirichlet improvements that I failed to commit 2023-04-04 23:13:17 -04:00
4ea29b8f0f Template group into GaugeImplTypes. Closing #2 2023-04-04 17:49:28 +01:00
778291230a expand ProjecOnGaugeGroup, change ProjectOnSp2nAlgebra into SpTa, fixing some of its issues 2023-04-04 17:48:13 +01:00
86dac5ff4f Better printing 2023-04-04 07:42:19 -07:00
4a382fad3f Use distinct SYCL queue for copies 2023-04-04 07:41:41 -07:00
cc753670d9 Barrier elimination, surface list build 2023-04-04 07:39:14 -07:00
cc9d88ea1c Fence changes and EXT kernel loop cout reduction 2023-04-04 07:37:23 -07:00
b281b0166e Put the barrier in the subroutine 2023-04-04 07:36:03 -07:00
6a21f694ff Apply barrier in Gather kernel sequence.
Could place before comms, or in Gather, but decided to insist Gather means Gather is done
2023-04-04 07:33:24 -07:00
fc4db5e963 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-04-03 18:26:11 -04:00
6252ffaf76 No unified 2023-04-03 18:25:22 -04:00
026e736dfa Projection on algebra can now be templated. Fix #12 2023-04-03 16:31:19 +01:00
4275b3f431 Fix typo and remove unnecessary lines 2023-04-03 12:01:52 +01:00
af64c1c6b6 Had managed to drop the accelerator_barrier() in the Wilson Compressor gather 2023-03-30 17:34:44 -04:00
866f48391a Temporary fix for develop incorrect results 2023-03-30 17:10:13 -04:00
a4df527d74 Merge pull request #428 from mmphys/bugfix/comm_none
Fixes for --enable-comms=none
2023-03-30 08:38:14 -04:00
5764d21161 Fixes for --enable-comms=none 2023-03-30 10:15:28 +01:00
496d04cd85 Weaken the Fence 2023-03-29 18:58:51 -04:00
10e6d7c6ce Merge branch 'feature/dirichlet' into develop 2023-03-29 16:26:47 -04:00
c42e25e5b8 Dirichlet remove 2023-03-29 16:25:52 -04:00
a00ae981e0 Fence propagation from SYCL 2023-03-29 15:00:40 -04:00
58e020b62a Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-29 14:37:40 -04:00
a7e1aceeca Compile fix on Nvidia 2023-03-29 14:36:50 -04:00
7212432f43 More careful fencing 2023-03-28 20:10:22 -07:00
4a261fab30 Changes premerge to develop 2023-03-28 20:04:21 -07:00
6af97069b9 Preparing for close of feature/dirichlet
Initial code change review complete
2023-03-28 13:39:44 -07:00
5068413cdb Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 08:35:38 -07:00
71c6960eea Commet 2023-03-28 08:34:24 -07:00
ddf6d5c9e3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 11:33:05 -04:00
39214702f6 feat: indentation fixed. 2023-03-28 16:30:34 +02:00
3e4614c63a feat: draft for bug-report issue template added. 2023-03-28 16:24:35 +02:00
900e01f49b Temporary 2023-03-27 21:35:06 -07:00
2376156fbc Merge branch 'develop' into feature/dirichlet 2023-03-27 21:33:50 -07:00
3f2fd49db4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-03-27 17:29:54 -07:00
0efa107cb6 Script update 2023-03-27 17:29:43 -07:00
8feedb4f6f Include files moved 2023-03-27 17:29:21 -07:00
05e562e3d7 Move the copy synch out to stencil and do one per call instead of one per packet 2023-03-27 17:28:38 -07:00
dd3bbb8fa2 MOve the synchronise out to the stencil so one call instead of one call per packet 2023-03-27 17:27:45 -07:00
2fbcf13c46 SYCL fix 2023-03-27 14:25:14 -07:00
4ea48ef0c4 Merge pull request #419 from lehner/feature/gpt
Separate rankSum from sum
2023-03-24 15:42:16 -04:00
5c85774ee3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-24 15:40:57 -04:00
d8a9a745d8 stream synchronise 2023-03-24 15:40:30 -04:00
dcf172da3b Merge pull request #415 from paboyle/feature/block_lanczos22
Feature/block lanczos22
2023-03-24 12:08:16 -04:00
d57ed25071 Merge branch 'feature/dirichlet' into feature/block_lanczos22 2023-03-24 12:08:09 -04:00
546be724e7 Merge pull request #421 from UniOfLeicester/feature/accel_Copy_plane
Populate the Cshift_table in the GPU
2023-03-24 12:04:06 -04:00
8a1b9073f9 Mshift update 2023-03-23 15:39:30 -04:00
1a7114d4b9 Temporary algorithm while sorting out mixed prec 2023-03-23 15:38:35 -04:00
3f385f717c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet
Conflicts:
	systems/PVC/benchmarks/run-2tile-mpi.sh
	systems/PVC/config-command
2023-03-23 14:52:53 -04:00
481bbaf1fc Interface to query memory use 2023-03-23 12:55:31 -04:00
281488611a WriteDiscard on construct 2023-03-23 10:28:50 -04:00
c180a52518 Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-23 10:28:01 -04:00
90130e25e9 TODO list 2023-03-23 10:27:02 -04:00
23298acb81 Merge pull request #424 from giltirn/feature/dirichlet-precchange
Precision change implementation
2023-03-22 23:04:52 -04:00
52384e34cf Discard on construct 2023-03-22 19:40:32 -04:00
d0bb033ea2 Device resident GPU block buffer instead of UVM as hit likely UVM
bug. Code worked on CUDA 11.4 but fails on later drivers (certainly 530.30.02, but need to
find the perlmutter driver version).
2023-03-22 19:07:32 -04:00
c6621806ca Compiling on laptop and running 2023-03-21 17:27:09 -04:00
0b6f0f6d2f Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:06:55 -04:00
b5b759df73 Merge branch 'develop' into feature/dirichlet 2023-03-21 16:05:46 -04:00
7db8dd7a95 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:04:27 -04:00
8b43be39c0 Config command 2023-03-21 16:00:52 -04:00
f17f879206 Test update 2023-03-21 15:59:29 -04:00
68428fceab Integrator update 2023-03-21 15:58:49 -04:00
4135f2dcd1 Compressor 2023-03-21 15:41:41 -04:00
c5bdf61215 AUdit fix 2023-03-21 15:38:39 -04:00
88e218e8ee Stencil updates 2023-03-21 15:37:58 -04:00
0f2b786436 Vector -> vector 2023-03-21 15:36:11 -04:00
e1c326558a COmms improvements 2023-03-21 08:53:56 -07:00
bae0f8ea99 Merge pull request #425 from rrhodgson/feature/CacheLogging
Huge Cache
2023-03-21 08:59:08 -04:00
bbbcd36ae5 Merge pull request #426 from rrhodgson/feature/LCDeflation
Batched Local Coherence Tools
2023-03-21 08:58:40 -04:00
39c0815d9e WriteDiscard 2023-03-21 08:57:29 -04:00
1b8176e2c0 fix code duplication 2023-03-17 14:58:00 +00:00
cbc053c3db Revert "projection on Sp2n algebra, to be used instead of Ta"
This reverts commit ba7f9d7b70.
2023-03-17 11:36:58 +00:00
cdf3f6ef6e Merge branch 'refactoring_sp2n' of https://github.com/LupoA/Grid into refactoring_sp2n 2023-03-15 15:59:50 +00:00
ba7f9d7b70 projection on Sp2n algebra, to be used instead of Ta 2023-03-15 15:55:12 +00:00
a997d24743 Remove nofma 2023-03-14 12:10:31 -07:00
861e5d7f4c SYCL version update. Why do they keep making incompatible changes 2023-03-14 12:10:02 -07:00
14cc142a14 Warning remove 2023-03-14 12:09:26 -07:00
f36b87deb5 syscall fix 2023-03-14 12:09:00 -07:00
eeb6e0a6e3 Renable cache blocking and efficient UPI type SHM comms 2023-03-14 09:10:27 -07:00
cad5b187dd Cleanup 2023-03-14 09:08:16 -07:00
87697eb07e SHared compile 2023-03-14 09:07:36 -07:00
371fd123fb consequence of iSUnMatrix being no longer a member of the SU class 2023-03-14 10:47:07 +00:00
d6ff644aab Towards the day all tests compile 2023-03-14 10:43:25 +00:00
29586f6b5e Deactivate some tests for Nc!=3 2023-03-13 08:17:14 +00:00
fd057c838f add ProjectOnGaugeGroup and ProjectGn to allow future templating in GaugeImplTypes 2023-03-10 12:10:46 +00:00
f51222086c Move functions from GaugeGroup to group specific implementations 2023-03-09 16:22:20 +00:00
a3e935c902 Batched block project/promote size checks 2023-02-27 11:38:16 +00:00
7731c7db8e Add huge cache type and allow Ncache==0 2023-02-26 14:15:28 +00:00
ff97340324 Expose cached bytes 2023-02-26 12:22:45 +00:00
83d86943db Fixed compile bug in MemoryManagerShared caused by Audit function not being passed a string 2023-02-23 13:09:45 -05:00
e82cf1d311 Further prec-change improvements
Mixed prec CG algorithm has been modified to precompute precision change workspaces

As the original Test_dwf_mixedcg_prec has been coopted to do a performance stability and reproducibility test, requiring the single-prec CG to be run 200 times, I have created a new version of Test_dwf_mixedcg_prec in the solver subdirectory that just does the mixed vs double CG test
2023-02-23 09:45:29 -05:00
1db58a8acc Precision change improvements
Added a new, much faster implementation of precision change that uses (optionally) a precomputed workspace containing pointer offsets that is device resident, such that all lattice copying occurs only on the device and no host<->device transfer is required, other than the pointer table. It also avoids the need to unpack and repack the fields using explicit lane copying. When this new precisionChange is called without a workspace, one will be computed on-the-fly; however it is still considerably faster than the original implementation.

In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes.

Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces

Renamed the original precisionChange as precisionChangeOrig

Fixed incorrect pointer offset bug in copyLane

Added a test and a benchmark for precisionChange

Added a test for reliable update CG
2023-02-21 10:52:42 -05:00
920a51438d Added batched Mixed precision CG 2023-02-14 17:04:13 +00:00
be528b6d27 Add batched block project/promote functions 2023-02-14 14:37:10 +00:00
f73691ec47 Merge pull request #18 from nickforce989/sp2n/newbranch
Sp2n/newbranch
2023-02-13 10:22:27 +01:00
ccd21f96ff Plaquette agreeing and moving to final form (slowly) need to optimise 2023-02-01 22:57:44 -05:00
4b90cb8888 First cut passes combining padded cell with general stencil towards fast plaquette and staggered force 2023-02-01 22:14:10 -05:00
7ebda3e9ec Merge commit 'b10e1b7bc8bec809f874e9e48a3ccc7b2619c9d1' into sp2n/newbranch 2023-01-19 12:10:18 +00:00
b10e1b7bc8 Fixed files giving zero force computation on GPU, issue #8 2023-01-18 18:04:47 +00:00
796abfad80 Merge pull request #422 from fjosw/fix/NVCC_DIAG_PRAGMA_SUPPORT
Disable diagnostic pragma warnings for CUDA 12+
2023-01-17 09:34:49 -05:00
ad0270ac8c fix: diagnostic pragma warnings fixed for CUDA 12+ 2023-01-12 12:36:30 +00:00
7d62f1d6d2 Populate the Cshift_table in the GPU
Cshift is allocated in Unified memory and used
in the LambdaApply kernels but also populated
from the host. This creates a lot of Unified HtoD
and DtoH mem operations and has a negative effect
in performance. With this commit we populate the
Cshift table in the device with the
populate_Cshift_table() kernel.
2023-01-11 21:26:25 +00:00
458c943987 merged upstream 2022-12-31 11:16:21 +02:00
88015b0858 Split sum in rankSum and GlobalSum 2022-12-26 10:01:32 +01:00
4ca1bf7cca Added gauge invariance test 2022-12-21 07:23:16 -05:00
2ff868f7a5 CPU open doesn't need to free space 2022-12-20 05:10:23 -05:00
ede02b6883 Memory manager debug Felix case 2022-12-20 05:10:23 -05:00
1822ced302 Bug fix 2022-12-20 05:10:23 -05:00
37ba32776f More logging 2022-12-20 05:10:23 -05:00
99b3697b03 More loggin 2022-12-20 05:10:23 -05:00
43a45ec97b SSC_START 2022-12-20 05:10:23 -05:00
b00a4142e5 A=A fix 2022-12-20 05:10:23 -05:00
3791bc527b Logging pulled in from dirichlet branch 2022-12-20 05:10:23 -05:00
d7dea44ce7 Merge pull request #17 from chillenzer/unify_gauge_groups
Fix compilation error in nvcc (closes #15)
2022-12-19 16:24:03 +00:00
d8c29f5fcf Updated FFT test for PETSc 2022-12-18 12:05:00 -05:00
37b6b82869 Fix file extensions 2022-12-18 16:12:56 +00:00
92ad5b8f74 Compiler error fix: NVCC requires names for templ. par. 2022-12-18 15:50:19 +00:00
281f8101fe Matt FFT test 2022-12-17 20:35:33 -05:00
472ed2dd5c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-12-17 20:17:09 -05:00
4f85672674 Simpler test for PETSc 2022-12-17 20:16:11 -05:00
dc747c54be Merge branch 'develop' into feature/dirichlet
Conflicts:
	Grid/qcd/action/fermion/WilsonCompressor.h
	Grid/stencil/Stencil.h
2022-12-13 08:24:58 -05:00
140684d706 Head to head vs HMC 2022-12-13 08:15:38 -05:00
5bb7ba92fa Test for DDHMC force term 2022-12-13 08:15:11 -05:00
b54d0f3c73 Smaller deltaH down to 7000s on t=0.5 trajectory 2022-12-13 08:14:27 -05:00
ff6777a98d Variable depth experiments 2022-12-13 08:13:51 -05:00
07acfe89f2 Merge pull request #417 from rrhodgson/feature/fermtoprop
Feature/fermtoprop
2022-12-06 12:45:03 -05:00
40234f531f FermToProp accelerator_for -> thread_for 2022-12-06 17:34:51 +00:00
d49694f38f PropToFerm fix 2022-12-06 15:48:54 +00:00
8c80f1c168 Merge pull request #14 from chillenzer/unify_gauge_groups
Unify gauge groups (closes #5)
2022-12-01 17:35:46 +00:00
dc6a38f177 Minor cleanup 2022-11-30 17:13:12 -05:00
82c1ecf60f Block lanczos added 2022-11-30 16:08:40 -05:00
67f569354e Partial dirichlet changes 2022-11-30 15:51:13 -05:00
97a098636d FermToProp 2022-11-30 15:36:35 -05:00
e13930c8b2 Faster fermtoprop case 2022-11-30 15:11:29 -05:00
0af7d5a793 Rename Grid/qcd/utils/<Group>_impl.h -> Grid/qcd/utils/<Group>.h 2022-11-30 17:12:00 +00:00
505fa49983 Renamed SUn.h -> GaugeGroup.h 2022-11-30 17:09:48 +00:00
7bcf33def9 Removed Sp2n.h 2022-11-30 16:59:46 +00:00
a13820656a Removed iSUnMatrix, etc. 2022-11-30 15:09:03 +00:00
fa71b46a41 Hide nsp 2022-11-30 14:44:23 +00:00
b8b3ae6ac1 Make helper functions private 2022-11-30 13:29:14 +00:00
55c008da21 Removed forward declaration 2022-11-30 13:12:21 +00:00
2507606bd0 With function overloading (still dirty). 2022-11-30 12:54:36 +00:00
7c2ad4f8c8 Attempt with SFINAE (failed) 2022-11-30 11:57:39 +00:00
54c8025aad Remove unnecessary pwd in scripts/filelist 2022-11-28 17:50:38 +00:00
921e23e83c Separated out everything SU specific 2022-11-28 17:47:50 +00:00
6e750ecb0e Remove apparently forgotten file 2022-11-28 16:33:46 +00:00
b8f1f5d2a3 Introduce GaugeGroup 2022-11-25 17:45:32 +00:00
9273f2937c Autoformat google style 2022-11-25 17:44:08 +00:00
1aa28b47ae Add existing test to check 2022-11-25 17:40:40 +00:00
629cb2987a Fix typo in Makefile.am 2022-11-25 17:40:21 +00:00
03235d6368 Fixed type in configure.ac 2022-11-25 16:57:40 +00:00
22064c7e4c Fixing #11 2022-11-25 13:10:29 +00:00
5fa573dfd3 partial send fix 2022-11-25 00:51:04 -05:00
f6402cb6c4 AUDIT removal 2022-11-25 00:50:33 -05:00
bae6c263dc Audit 2022-11-25 00:47:01 -05:00
d71672dca9 Bug fix 2022-11-25 00:46:35 -05:00
121c9e2ceb Tracing 2022-11-25 00:45:21 -05:00
63a30ae34f Tracing 2022-11-25 00:45:05 -05:00
7d8231ba32 Tracing 2022-11-25 00:44:57 -05:00
b690b1cbe9 Audit 2022-11-25 00:43:57 -05:00
c0fb20fc03 Audit check for wrongly locked data 2022-11-25 00:43:12 -05:00
bc9579dac6 Old code path removed 2022-11-25 00:40:45 -05:00
a5c77f8b95 Tracing moved in order 2022-11-25 00:40:27 -05:00
2de03e5172 Revert "Revert "Fixing issue #11: consistent use of ncolour and nsp""
This reverts commit 3af4929dda.
2022-11-23 19:40:28 +00:00
3af4929dda Revert "Fixing issue #11: consistent use of ncolour and nsp"
This reverts commit 1ba429345b.
2022-11-23 19:34:59 +00:00
1ba429345b Fixing issue #11: consistent use of ncolour and nsp 2022-11-23 18:45:01 +00:00
3dbfce5223 Tests clean build on HIP 2022-11-16 20:15:51 -05:00
e51eaedc56 Making tests compile 2022-11-15 22:58:30 -05:00
e2a938e7f7 GPU happy for compile...? 2022-11-15 17:48:18 -05:00
ddad25211b Extra instantiations 2022-11-15 17:47:52 -05:00
6209120de9 Fix to GPU compile attempt 2022-11-15 17:25:58 -05:00
fe6e8f5ac6 Benchmark_comms fix 2022-11-15 17:00:49 -05:00
ee84dcb400 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-11-15 16:41:55 -05:00
0ae0e5f436 Partial Dirichlet test 2022-11-15 16:40:38 -05:00
e047616571 Multilevel integrator test 2022-11-15 16:39:39 -05:00
1af7572c61 Some test HMCs for DDHMC 2022-11-15 16:38:51 -05:00
653039695b Partial dirichlet changes 2022-11-15 16:37:15 -05:00
ca62abd203 Record some perturbative free field calculation 2022-11-15 16:36:46 -05:00
e74666a09c Double length vector type for fast precision change 2022-11-15 16:34:21 -05:00
45a001e078 Debug compile 2022-11-15 16:27:20 -05:00
0352da34f0 Several deleted files 2022-11-15 16:26:49 -05:00
7d302a525d Natural place for this routine is here 2022-11-15 16:24:55 -05:00
e2e269e03b Partial dirichlet BCs 2022-11-15 16:24:26 -05:00
0db4f1803f Partial dirichlet support 2022-11-15 16:23:41 -05:00
5fe480d81c Generic patch 2022-11-15 16:21:45 -05:00
0566fc6267 Partial Dirichlet 2022-11-15 16:21:24 -05:00
a11c12e2e7 Modifications for partial dirichlet BCs 2022-11-15 16:20:01 -05:00
0655dab466 Open MP on host enabled 2022-11-08 13:38:54 -08:00
7f097bcc28 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-11-08 13:23:40 -08:00
5c75aa5008 Device mem 2022-11-08 13:22:57 -08:00
1873101362 PVC 2022-11-08 13:22:45 -08:00
63fd1dfa62 Config on PVC 2022-11-08 13:22:09 -08:00
bd68861b28 SYCL sum 2022-11-08 12:49:26 -08:00
82e959f66c SYCL reduction 2022-11-08 12:45:25 -08:00
006268f556 DWF Slow version 2022-11-02 20:24:51 -04:00
78acae9b50 Simple DWF for easy check 2022-11-02 20:24:17 -04:00
a3927a8a27 Dirichlet 2022-11-02 20:22:27 -04:00
d9dd9a5b5f LLVM update 2022-11-02 19:51:50 -04:00
eae1c02111 Bounds check 2022-11-02 19:50:32 -04:00
132d841b05 Compile fix 2022-11-02 19:33:22 -04:00
62e52de06d Merge pull request #414 from fjosw/feat/eCloverGPU
Compact Exponential Cloverterm on GPU
2022-11-01 09:15:44 -04:00
184adeedb8 feat: renamed open_boundaries to fixedBoundaries 2022-10-26 12:53:46 +01:00
5fa6a8b96d docs: CompactClover debug info generalized. 2022-10-26 12:41:14 +01:00
a2a879b668 docs: CompactClover Debug Info improved. 2022-10-25 17:20:42 +01:00
9317d893b2 docs: details about inversion of CompactClover term added. 2022-10-25 17:10:06 +01:00
86075fdd45 feat: MassTerm and ExponentiateClover merged into InstantiateClover 2022-10-25 17:05:34 +01:00
b36442e263 feat: CloverHelpers::InvertClover implemented which handles the
inversion of the Clover term depending on clover type and the boundary
conditions.
2022-10-25 16:57:01 +01:00
513d797ea6 fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed. 2022-10-25 16:17:22 +01:00
9e4835a3e3 feat: changed CompactWilsonExpClover exponentiation to Taylor expansion
with Horner scheme.
2022-10-25 15:19:43 +01:00
2e8c3b0ddb Slow implementation of Shamir DWF 2022-10-18 18:10:01 -04:00
991667ba5e Revert 2022-10-13 18:50:35 -04:00
8a07b52009 Dirichlet 2022-10-13 18:44:47 -04:00
2bcff94b52 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-13 18:42:04 -04:00
d089739e2f Hack for lattice sites 2022-10-13 17:55:50 -04:00
204c283e16 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-11 14:59:07 -04:00
551a5f8dc8 RRII gpu option 2022-10-11 14:44:55 -04:00
c82b164f6b Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-10-04 17:41:48 -04:00
584a3ee45c Merge pull request #412 from giltirn/patch/adaptive-wflow
Patch/adaptive wflow
2022-10-04 17:23:19 -04:00
eec0c9eb7d Merge pull request #411 from giltirn/patch/dirichlet-fixes
Various fixes / changes
2022-10-04 17:22:01 -04:00
477ebf24f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-10-04 11:19:43 -07:00
0d5639f707 Run script update 2022-10-04 11:13:41 -07:00
413312f9a9 Benchmark the halo construction.
THe bye counts are out and should be doubled for SIMD directions
2022-10-04 11:12:59 -07:00
03508448f8 Remove verbose 2022-10-04 11:12:15 -07:00
e1e5c75023 Stencil gather improvements - SVM was running slow and used for a pointer array that wasn't needed to be in SVM 2022-10-04 11:11:10 -07:00
9296299b61 Better commenting 2022-10-04 11:10:34 -07:00
66d001ec9e Refactored Wilson flow class; previously the class implemented both iterative and adaptive smearing, but only the iterative method was accessible through the Smearing base class. The implementation of Smearing also forced a clunky need to pass iterative smearing parameters through the constructor but adaptive smearing parameters through the function call. Now there is a WilsonFlowBase class that implements common functionality, and separate WilsonFlow (iterative) and WilsonFlowAdaptive (adaptive) classes, both of which implement Smearing virtual functions.
Modified the Wilson flow adaptive smearing step size update to implement the original Ramos definition of the distance, where previously it used the norm of a difference which scales with the volume and so would choose too coarse or too fine steps depending on the volume. This is based on Chulwoo's code.

Added a test comparing adaptive (with tuneable tolerance) to iterative Wilson flow smearing on a random gauge configuration.
2022-10-03 10:59:38 -04:00
fad2f969d9 Summit up to date 2022-09-27 10:58:43 -04:00
48165c1dc1 Ticked off a few items 2022-09-27 10:58:00 -04:00
25df2d2c3b Various precision options 2022-09-27 10:57:12 -04:00
af9ecb8b41 Current tests compiling 2022-09-27 10:56:55 -04:00
234324599e Double2 2022-09-27 10:56:10 -04:00
97448a93dc Double2 compiles and dslash runs 2022-09-27 10:55:25 -04:00
70c83ec3be More instantiations 2022-09-27 10:54:23 -04:00
8f4e2ee545 Double2 2022-09-27 10:53:46 -04:00
e8bfbf2f7c D2 operators 2022-09-27 10:37:45 -04:00
9e81b42981 D2 fields 2022-09-27 10:37:19 -04:00
6c9eef9726 D2 fields 2022-09-27 10:36:54 -04:00
7ffbc3e98e Double2 improved. REally don't like 'convertType' - localise to a GPT
header
2022-09-27 10:35:31 -04:00
68e4d833dd Run through wrapper script 2022-09-23 16:49:29 -04:00
a2cefaa53a Faster 2022-09-23 16:49:14 -04:00
a0d682687e Better logging of Fdt for force gradient 2022-09-23 16:22:53 -04:00
eb552c3ecd dt info 2022-09-23 16:22:28 -04:00
97cce103d7 Tolerances control 2022-09-23 16:21:49 -04:00
87ac7104f8 Prettier 2022-09-23 16:20:46 -04:00
e4c117aabf Compile fix, multishift mixed prec support 2022-09-23 16:19:27 -04:00
5b128a6f9f MixedPrec Multishift with better precision scheme for GPU 2022-09-23 16:18:47 -04:00
19da647e3c Added support for non-periodic gauge field implementations in the random gauge shift performed at the start of the HMC trajectory
(The above required exposing the gauge implementation to the HMC class through the Integrator class)
Made the random shift optional (default on) through a parameter in HMCparameters
Modified ConjugateBC::CshiftLink such that it supports any shift in  -L < shift < L rather than just +-1
Added a tester for the BC-respecting Cshift
Fixed a missing system header include in SSE4 intrinsics wrapper
Fixed sumD_cpu for single-prec types performing an incorrect conversion to a single-prec data type at the end, that fails to compile on some systems
2022-09-09 12:47:09 -04:00
1713de35c0 Improved config flags 2022-09-05 21:50:02 -04:00
1177b8f661 Merge branch 'develop' into feature/dirichlet 2022-08-31 19:05:57 -04:00
442bfb3d42 Merge branch 'develop' into feature/dirichlet 2022-08-31 19:04:19 -04:00
e7d9b75fdd Warning fixes 2022-08-31 19:01:14 -04:00
3d0e3ec363 Tracing 2022-08-31 18:31:46 -04:00
3c1c51f9aa Merge branch 'feature/dirichlet-gparity' into feature/dirichlet 2022-08-31 18:25:34 -04:00
8cc3c522c3 Merge pull request #409 from giltirn/feature/dirichlet-gparity-stage
Import round 5
2022-08-31 18:22:50 -04:00
913fbca74a Merge pull request #410 from gkanwar/photon_and_sha_patches
Photon.h and SHA256 patches
2022-08-31 18:01:45 -04:00
5c87342108 Used in g-2 sign off 2022-08-31 17:35:32 -04:00
66177bfbe2 Used in g-2 sign off 2022-08-31 17:35:07 -04:00
5205e68963 RocTX, NVTX, text based self profiling 2022-08-31 17:34:09 -04:00
cd5cf6d614 Tracing replaces self timing hooks 2022-08-31 17:33:41 -04:00
5abb19eab0 Remove self timing 2022-08-31 17:32:49 -04:00
06d7b88c78 Force reporting improved 2022-08-31 17:32:21 -04:00
cf72799735 Better action naming 2022-08-31 17:24:11 -04:00
cdb8fcc269 Width=4 support. This is too broad; hit it on physical point run.
Need to change strategy, I think.
2022-08-31 17:21:33 -04:00
b4f4130901 Defer SMP node links until after interior. Allows for DMA overlapping
compute
2022-08-31 17:20:21 -04:00
bb049847d5 Tracing replaces self timing 2022-08-31 17:19:02 -04:00
fd33c835dd Feynman rule fix and tracing replaces self timing 2022-08-31 17:18:17 -04:00
21371a7e5b Tracing replaces self timing 2022-08-31 17:16:05 -04:00
abfaa00d3e Tracing replaces self timing 2022-08-31 17:15:24 -04:00
efee33c55d Tracing replaces self timing 2022-08-31 17:14:57 -04:00
db0fe6ddbb Tracing replaces self timinng 2022-08-31 17:14:14 -04:00
8a9e647120 Tracing replaces self timing 2022-08-31 17:13:44 -04:00
e6dcb821ad Tracing replaces self timing 2022-08-31 17:12:31 -04:00
9bff188f02 Tracing replaces self timing 2022-08-31 17:12:05 -04:00
111b30ca1d Tracing replaces self timing 2022-08-31 17:11:48 -04:00
24182ca8bf HIP allows conserved currents.
Tracing replaces self timeing
2022-08-31 17:11:18 -04:00
ee2d7369b3 Tracing replaces self timing 2022-08-31 17:10:45 -04:00
7c686d29c9 Tracing replaces self timing 2022-08-31 17:10:17 -04:00
e8a0a1e75d Tracing replaces self timing hooks 2022-08-31 17:09:47 -04:00
730be89abf Remove timing hooks as tracing replaces 2022-08-31 17:08:44 -04:00
f991ad7d5c Remove timing hooks as tracing replaces 2022-08-31 17:08:18 -04:00
b3f33f82f7 Decrease self timing hooks, use nvtx / roctx type tracing hooks instead 2022-08-31 17:06:47 -04:00
a34a6e059f Logging improvement. Sinitial will be used to improve RHMC terms 2022-08-31 17:06:08 -04:00
1333319941 Tracing 2022-08-31 17:00:25 -04:00
9295ed8d20 Print full memory range 2022-08-31 16:59:51 -04:00
19cc7653fb Tracing 2022-08-31 16:57:51 -04:00
5752538661 Tracing 2022-08-31 16:57:32 -04:00
ca40a1b00b Tracing 2022-08-31 16:54:55 -04:00
659fac9dfb Tracing hook 2022-08-31 16:54:25 -04:00
4dc3d6fce0 Buy into Nvidia/Rocm etc... tracing. 2022-08-31 16:53:19 -04:00
60dfb49afa Remove FP16 tests when FP16 is disabled 2022-08-21 17:29:55 +02:00
554c238359 Update OpenSSL digest to use high-level methods
This avoids deprecation warnings when compiling against OpenSSL 3.0
but should still be backwards compatible. It is the recommended way
to use the digest API going forward.
2022-08-21 17:28:57 +02:00
f922adf05e Fix Photon ComplexField type 2022-08-21 16:16:18 +02:00
95b640cb6b 10TF/s on 32^3 x 64 on single node 2022-08-04 15:43:52 -04:00
2cb5bedc15 Copy stream HIP improvements 2022-08-04 15:24:03 -04:00
806b02bddf Simplify dead code 2022-08-04 15:23:13 -04:00
de40395773 More timing. Think I should start to use nvtx and rocmtx ?? 2022-08-04 13:37:16 -04:00
7ba4788715 Fix 2022-08-04 13:36:44 -04:00
06d9ce1a02 Synch ranks on node here for GPU - GPU memcopy 2022-08-04 13:35:56 -04:00
75bb6b2b40 Move barrier into the StencilSend begin routine 2022-08-04 13:35:26 -04:00
74f10c2dc0 Move barrier into Stencil Send 2022-08-04 13:34:11 -04:00
188d2c7a4d PVC default, ignore ATS 2022-08-02 08:38:53 -07:00
17d7177105 Files for SYCL 2022-08-02 08:33:39 -07:00
bb0a0da47a inon blocking caution due to SYCL 2022-08-02 08:09:43 -07:00
84110166e4 Fix the fence 2022-08-02 08:00:43 -07:00
d32b923b6c Fencing on a stream in SYCL is needed. Didn't know that ... gulp 2022-08-02 07:58:04 -07:00
a93d5459d4 Better mpi request completion 2022-07-28 12:18:35 -04:00
9c21add0c6 High res timer replaces getttimeofday 2022-07-28 12:14:03 -04:00
639aab6563 High res timer instead of gettimeofday 2022-07-28 12:13:35 -04:00
8137cc7049 Allways concurrent comms 2022-07-28 12:01:51 -04:00
60e63dca1d Add memory logging channel 2022-07-28 11:39:15 -04:00
486409574e Expanded cach to avoid any allocs in HMC 2022-07-28 11:38:34 -04:00
a913b8be12 Dslash self timing. Might want to not have this 2022-07-28 11:37:55 -04:00
2239751850 Better logging 2022-07-28 11:37:36 -04:00
9b20f1449c Better timing 2022-07-28 11:37:12 -04:00
b99453083d Updated timing 2022-07-28 11:37:02 -04:00
2ab1af5754 Ensure no synchronize and not optoin dependent 2022-07-19 09:51:06 -07:00
5f8892bf03 Mistake pointed out by Camilo 2022-07-19 09:31:51 -07:00
f14e7e51e7 Grid accelerator 2022-07-12 10:56:22 -07:00
943fbb914d Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-07-11 13:48:42 -04:00
ca4603580d Verbose 2022-07-11 13:48:35 -04:00
f73db8f1f3 Synch clocks 2022-07-11 13:47:39 -04:00
f7217d12d2 World barrier for clock synch 2022-07-11 13:45:31 -04:00
fab50c57d9 More loggin 2022-07-11 18:42:27 +01:00
3440534fbf MixedPrec support 2022-07-10 21:35:18 +01:00
177b1a7ec6 Mixed prec 2022-07-10 21:34:10 +01:00
58182fe345 Different approach to default dirichlet params 2022-07-10 21:32:58 +01:00
1f907d330d Different default params for dirichlet 2022-07-10 21:31:48 +01:00
b0fe664e9d Better force log info 2022-07-10 21:31:25 +01:00
c0f8482402 Remove SSC marks 2022-07-07 17:49:36 +01:00
3544965f54 Stream doesn't work 2022-07-07 17:49:20 +01:00
33e4a0caee Imported changes from feature/gparity_HMC branch:
Rework of WilsonFlow class
		Fixed logic error in smear method where the step index was initialized to 1 rather than 0, resulting in the logged output value of tau being too large by epsilon
		Previously smear_adaptive would maintain the current value of tau as a class member variable whereas smear would compute it separately; now both methods maintain the current value internally and it is updated by the evolve_step routines. Both evolve methods are now const.
		smear_adaptive now also maintains the current value of epsilon internally, allowing it to be a const method and also allowing the same class instance to be reused without needing to be reset
		Replaced the fixed evaluation of the plaquette energy density and plaquette topological charge during the smearing with a highly flexible general strategy where the user can add arbitrary measurements as functional objects that are evaluated at an arbitrary frequency
	        By default the same plaquette-based measurements are performed, but additional example functions are provided where the smearing is performed with different choices of measurement that are returned as an array for further processing
		Added a method to compute the energy density using the Cloverleaf approach which has smaller discretization errors
	Added a new tensor utility operation, copyLane, which allows for the copying of a single SIMD lane between two instances of the same tensor type but potentially different precisions
	To LocalCoherenceLanczos, added the option to compute the high/low eval of the fine operator on every restart to aid in tuning the Chebyshev
	Added Test_field_array_io which demonstrates and tests a single-file write of an arbitrary array of fields
	Added Test_evec_compression which generates evecs using Lanczos and attempts to compress them using the local coherence technique
	Added Test_compressed_lanczos_gparity which demonstrates the local coherence Lanczos for G-parity BCs
	Added HMC main programs for the 40ID and 48ID G-parity lattices
2022-07-01 14:12:12 -04:00
1f903d9296 Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-07-01 12:12:50 -04:00
4df1e0987f Merge branch 'feature/dirichlet-gparity' of https://github.com/paboyle/Grid into feature/dirichlet-gparity 2022-07-01 09:55:43 -04:00
588c2f3cb1 Faster axpy_norm and innerProduct 2022-07-01 09:44:58 -04:00
bd99fd608c Introduce a non-default stream for compute operatoins 2022-07-01 09:42:53 -04:00
57b442d0de Log memory operations 2022-07-01 09:42:17 -04:00
751a4562d7 Timing improvement 2022-07-01 09:41:43 -04:00
ca66301dee Remove debug 2022-06-30 14:53:12 -04:00
808bb59206 Mixed prec DD-RHMC 2022-06-30 13:50:09 -04:00
4b7f51d19d Create a new RNG file 2022-06-30 13:49:50 -04:00
d03152fac4 New file under debug 2022-06-30 13:49:35 -04:00
137f190258 Dirichlet implementation 2022-06-30 13:45:07 -04:00
53d01312b3 Rough flop counting, need to add M5D, M5Ddag, MooeeInv flops 2022-06-30 13:44:09 -04:00
220050822a Speed up M5D and M5Ddag 2022-06-30 13:43:27 -04:00
87ad76d81b Initialise timeval 2022-06-30 13:42:46 -04:00
042ab1a052 Update GridStd.h 2022-06-27 13:21:39 -04:00
4ac1094856 Updated config commands 2022-06-27 12:16:24 -04:00
d44a57b0af Allow frequency=0 to disable 2022-06-27 12:15:55 -04:00
dc000d10ee Spelling correction 2022-06-27 12:14:57 -04:00
3685f391cf More verbose CG 2022-06-27 12:11:08 -04:00
efd7338a00 Allow dirichlet at round the world link 2022-06-27 12:10:27 -04:00
e1e7b1e224 RNG fix 2022-06-27 12:09:52 -04:00
7319d4e1ad Merge pull request #407 from giltirn/feature/dirichlet-gparity-stage
Import round 4
2022-06-22 15:23:36 -04:00
fd933420c6 Imported changes from feature/gparity_HMC branch:
Added a bounds-check function for the RHMC with arbitrary power
	Added a pseudofermion action for the rational ratio with an arbitrary power and a mixed-precision variant of the same. The existing one-flavor rational ratio class now uses the general class under the hood
	To support testing of the two-flavor even-odd ratio pseudofermion, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field
	Added a new HMC runner start type: CheckpointStartReseed, which reseeds the RNG from scratch, allowing for the creation of new evolution streams from an existing checkpoint. Added log output of seeds used when the RNG is seeded.
	EOFA changes:
		To support mixed-precision inversion, generalized the class to maintain a separate solver for the L and R operators in the heatbath (separate solvers are already implemented for the other stages)
		To support mixed-precision, the action of setting the operator shift coefficients is now maintained in a virtual function. A derived class for mixed-precision solvers ensures the coefficients are applied to both the double and single-prec operators
		The ||^2 of the random source is now stored by the heatbath and compared to the initial action when it is computed. These should be equal but may differ if the rational bounds are not chosen correctly, hence serving as a useful and free test
		Fixed calculation of M_eofa (previously incomplete and #if'd out)
		Added functionality to compute M_eofa^-1 to complement the calculation of M_eofa (both are equally expensive!)
		To support testing, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field
	Added a test program which computes the G-parity force using the 1 and 2 flavor implementations and compares the result. Test supports DWF, EOFA and DSDR actions, chosen by a command line option.
	The Mobius EOFA force test now also checks the rational approximation used for the heatbath
	Added a test program for the mixed precision EOFA compared to the double-prec implementation,
	G-parity HMC test now applied GPBC in the y direction and not the t direction (GPBC in t are no longer supported) and checkpoints after every configuration
	Added a test program which computes the two-flavor G-parity action (via RHMC) with both the 1 and 2 flavor implementations and checks they agree
	Added a test program to check the implementation of M_eofa^{-1}
2022-06-22 10:27:48 -04:00
8208a6214f Merge branch 'feature/dirichlet-gparity' into feature/dirichlet 2022-06-15 19:23:48 -04:00
3d8146b596 Merge branch 'feature/dirichlet-gparity' of https://github.com/paboyle/Grid into feature/dirichlet-gparity 2022-06-15 19:20:27 -04:00
31efa5c4da Script updates for current summit 2022-06-15 19:19:44 -04:00
d10d30dda8 Script update 2022-06-15 19:18:58 -04:00
0e9666bc92 Test update 2022-06-15 19:18:42 -04:00
6efd80f104 Printing 2022-06-15 18:23:46 -04:00
fdef7a1a8c Dirichlet fix 2022-06-15 00:05:20 -04:00
501bb117bf Const correct 2022-06-15 00:04:09 -04:00
05ca7dc252 Const correctness 2022-06-14 23:41:05 -04:00
e9648a1635 Useful periodic print. CG convergence bound is remarkably accurate on
low eigenvalue in numerical tests
2022-06-14 23:40:04 -04:00
2df98a99bc Merge pull request #406 from giordano/patch-1
Update default value of gen-simd-width in README
2022-06-14 17:46:25 -04:00
315ea18be2 Update default value of gen-simd-width in README 2022-06-14 22:41:05 +01:00
9a9f4a111f Merge pull request #405 from giltirn/feature/dirichlet-gparity-stage
Import round 3
2022-06-06 18:45:37 -04:00
1ad54d049d To PeriodicBC and ConjugateBC, added a new function "CshiftLink" which performs a boundary-aware C-shift of links or products of links. For the latter, the links crossing the global boundary are complex-conjugated.
To the gauge implementations, added CshiftLink functions calling into the appropriate operation for the BC in a given direction.
GaugeTransform, FourierAcceleratedGaugeFixer and WilsonLoops::FieldStrength no longer implicitly assume periodic boundary conditions; instead the shifted link is obtained using CshiftLink and is aware of the gauge implementation.
Added an assert-check to ensure that the gauge fixing converges within the specified number of steps.
Added functionality to compute the timeslice averaged plaquette
Added functionality to compute the 5LI topological charge and timeslice topological charge
Added a check of the properties of the charge conjugation matrix C=-gamma_2 gamma_4 to Test_gamma
Fixed const correctness for Replicate
Modified Test_fft_gfix to support either conjugate or periodic BCs, optionally disabling Fourier-accelerated gauge fixing, and tuning of alpha using cmdline options
2022-06-02 15:30:41 -04:00
57bd0a0a22 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-06-01 19:29:38 -04:00
b49db84b08 Slurm updates 2022-06-01 19:27:42 -04:00
583f7c52f3 SSC mark 2022-06-01 19:27:29 -04:00
58a86c9164 SSC mark removal 2022-06-01 19:27:06 -04:00
a25b32847f Crusher patch 2022-06-01 19:26:37 -04:00
6f1a2e132b SSC mark causing problems 2022-06-01 19:26:06 -04:00
b1ede7b46d Faster RNG init 2022-06-01 19:25:42 -04:00
e762c940c2 Reduce the loop over exterior for GPU to indirection table 2022-06-01 14:29:25 -07:00
6a1a198144 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-29 11:08:09 -04:00
34faa39f4f Clean up Dirichlet. Big oops fix 2022-05-28 17:18:08 -07:00
5ddea3829d Extra easier signature for peek 2022-05-28 15:52:39 -07:00
7eb29cf529 MPI fix 2022-05-28 15:51:34 -07:00
f729b9b889 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-25 14:16:09 -04:00
4f997c5f04 Remove extra face kernels in Dirichlet 2022-05-25 11:15:25 -07:00
a9c2e1df03 Merge pull request #404 from rrhodgson/feature/json_nvcc
Feature/json nvcc
2022-05-25 13:30:11 -04:00
d3496d2fe0 Merge pull request #397 from giltirn/feature/dirichlet-gparity-stage
Gparity HMC import round 2
2022-05-25 13:29:45 -04:00
60f4cb0ffd Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-25 12:38:10 -04:00
136d843ce7 Crusher updates 2022-05-25 12:36:09 -04:00
18028f4309 Merge branch 'develop' into feature/dirichlet 2022-05-24 18:26:18 -07:00
5164016740 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-05-24 18:25:57 -07:00
d83beaa890 Update perlmutter 2022-05-24 18:25:00 -07:00
f9f05e995b Update perlmutter 2022-05-24 18:24:38 -07:00
e651b9e7ab Clean up stencil with better intranode Dirichlet / DDHMC support.
14TF/s on a Perlmutter node
2022-05-24 18:23:39 -07:00
47b4e91473 Verbose change 2022-05-24 18:19:18 -07:00
3f31afa4fc Clean up verbose 2022-05-24 18:18:51 -07:00
da4daea57a Updated json to latest release 3.10.5 2022-05-24 16:16:06 +01:00
af3b065add Merge pull request #403 from fjosw/fix/cuda_11_5_warnings
Fixed nvcc 11.5+ warnings
2022-05-24 11:10:02 -04:00
e346154c5d Updated json CUDA compile guards 2022-05-24 15:48:01 +01:00
7937ac2bab fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in pugixml/pugixml.cc 2022-05-24 15:31:03 +01:00
e909aeedf0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in Grid_Eigen_Dense.h 2022-05-24 15:29:42 +01:00
bab8aa8eb0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT
standard in DisableWarnings.h
2022-05-24 15:27:40 +01:00
38b22f05be Merge pull request #402 from fjosw/fix/clover_warnings
fixed clover warnings
2022-05-24 10:05:27 -04:00
3ca0de1c40 Fix json write for vector<string> 2022-05-24 14:37:33 +01:00
c7205d2a73 Removed nvcc guards for json 2022-05-24 14:30:26 +01:00
617c5362c1 fix: fixed warning: missing return statement at end of non-void function
in CloverHelpers
2022-05-24 11:37:33 +01:00
083b58e66d Merge pull request #401 from JPRichings/LocalCoheranceDeflation
Local coherance batch deflation
2022-05-20 11:44:22 -04:00
633427a2df Merge pull request #400 from JPRichings/wilson_sweep
bench wilson sweep fix
2022-05-20 11:43:40 -04:00
2031d6910a Merge branch 'paboyle:develop' into wilson_sweep 2022-05-20 16:20:23 +01:00
f82ce67624 Dirichlet improved 2022-05-19 19:17:11 -07:00
b52e8ef65a Dirichlet changes 2022-05-19 16:45:41 -07:00
2594e3c230 Dirichlet option 2022-05-19 16:45:19 -07:00
8cedb45af2 Dirichlet BCs 2022-05-19 16:45:02 -07:00
aa008cbe99 Updated for new Dirichlet interface 2022-05-19 16:44:39 -07:00
79e34b3eb4 Local Coherence batch deflation 2022-05-19 14:53:17 +01:00
4f3d581ab4 Merge branch 'paboyle:develop' into LocalCoheranceDeflation 2022-05-19 14:46:17 +01:00
6fb6ca5b6b Merge branch 'develop' into feature/dirichlet 2022-05-17 09:09:00 -07:00
b8ee19691c Updated config for PM 2022-05-17 09:08:12 -07:00
d16427b837 Merge pull request #399 from fjosw/fix/Nc_neq_3
fix: assert for dimensions of compact Wilson clover moved to constructor
2022-05-17 09:03:42 -04:00
4b1997e2f3 wilson sweep test 2022-05-16 15:58:33 +01:00
8939d5dc73 bugfix: eo operator called in correct location 2022-05-16 00:28:28 +01:00
b051e00de0 Additional Local Coherance Deflation operator() 2022-05-16 00:25:13 +01:00
8aa75b492f Merge branch 'develop' into fix/Nc_neq_3 2022-05-10 14:22:03 +01:00
0274f40686 Merge pull request #389 from mbruno46/mbruno-eclover
Feature/expClover
2022-05-10 09:18:19 -04:00
77aa147ce5 Merge branch 'develop' into mbruno-eclover 2022-05-10 09:16:53 -04:00
32facbd02a fix: assert for dimensions of compact Wilson clover moved to
constructor.
2022-05-10 10:53:22 +01:00
6121397587 Imported changes from feature/gparity_HMC branch:
Added storage of final true residual in mixed-prec CG and enhanced log output
	Fixed const correctness of multi-shift constructor
	Added a mixed precision variant of the multi-shift algorithm that uses a single precision operator and applies periodic reliable update to the residual
	Added tests/solver/Test_dwf_multishift_mixedprec to test the above
	Fixed local coherence lanczos using the (large!) max approx to the chebyshev eval as the scale from which to judge the quality of convergence, resulting a test that always passes
	Added a method to local coherence lanczos class that returns the fine eval/evec pair
	Added iterative log output to power method
	Added optional disabling of the plaquette check in Nerscio to support loading old G-parity configs which have a factor of 2 error in the plaquette
	G-parity Dirac op no longer allows GPBC in the time direction; instead we toggle between periodic and antiperiodic
	Replaced thread_for G-parity 5D force insertion implementation with accelerator_for version capable of running on GPUs
	Generalized tests/lanczos/Test_dwf_lanczos to support regular DWF as well as Gparity, with the action chosen by a command line option
	Modified tests/forces/Test_dwf_gpforce,Test_gpdwf_force,Test_gpwilson_force to use GPBC a spatial direction rather than the t-direction, and antiperiodic BCs for time direction
	tests/core/Test_gparity now supports using APBC in time direction using command line toggle
2022-05-09 16:27:57 -04:00
4de50ab146 Merge pull request #396 from fjosw/fix/readd_config.h
fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am
2022-05-09 08:26:48 -04:00
8b12a61097 fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am 2022-05-09 11:53:22 +01:00
79ea027c0b Merge pull request #377 from RJHudspith/develop
NERSC and ILDG for non-SU(3) configuration checkpoints
2022-05-03 08:55:48 -04:00
62339d437f Merge pull request #387 from lehner/feature/gpt
Parity mass terms for domain wall fermions to enable 4d eofa
2022-05-03 08:52:18 -04:00
698e745276 Merge pull request #390 from fjosw/feature/conserved_current_wilson
Conserved current for wilson fermions
2022-05-03 08:51:10 -04:00
0417b96896 Merge pull request #391 from giltirn/feature/dirichlet-gparity-stage
First stage of import
2022-05-03 08:50:18 -04:00
9a6e2c315d Merge pull request #394 from fjosw/fix/gauge_fix_ErrorOnNoConverge
SteepestDescentGaugeFix now exits when the algorithm does not converge.
2022-05-03 08:49:26 -04:00
e61fed87db SteepestDescentGaugeFix now exits when the algorithm does not converge.
This behaviour can be altered by setting err_on_no_converge to false.
2022-04-20 15:41:55 +01:00
81fe4c937e Hopefully fix link errors on Intel compilers due to having no function body for MomentumFilterBase::apply_phase 2022-04-12 09:51:59 -04:00
f77f3a6598 Imported G-parity flavor algebra + tester from feature/gparity_HMC branch 2022-04-06 10:21:04 -04:00
239afb18fb Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-04-05 16:49:32 -04:00
ef820a26cd Bcopy on crusher compile 2022-04-05 16:49:02 -04:00
65abe4d0d3 Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-04-05 16:26:54 -04:00
5012adfebf Merge branch 'develop' into feature/dirichlet 2022-04-05 16:26:19 -04:00
b808d48fa1 Tone down printing in integrator 2022-04-05 16:25:22 -04:00
83f818a99d Updates for DDHMC 2022-04-05 16:24:34 -04:00
b8bc560b51 Test_wilson_conserved_current implemented, all 5d references removed. 2022-04-05 17:33:45 +01:00
6bc2483d57 Merge branch 'feature/eclover' into feature/conserved_current_wilson 2022-04-05 15:26:49 +01:00
82aecbf4cf Test_wilson_conserved_current added 2022-04-05 15:26:39 +01:00
ee23a76aa0 Merge pull request #2 from fjosw/feature/eclover
Feature/eclover
2022-04-05 13:30:13 +02:00
d7191e5a02 SeqConservedCurrent implemented for Wilson fermions 2022-04-05 11:48:56 +01:00
c8a824425b Error message added if another conserved current than vector is requested for
Wilson type fermions.
2022-04-05 10:58:22 +01:00
f23626a6b8 End scope by additional block in CloverHelpers.h 2022-04-02 16:08:15 +01:00
6577a03d16 Explcitly closed views in Exponentiate_Clover 2022-04-01 18:39:12 +01:00
427c8695fe Change signs and prefactors for conserved current to mimic the 5d
version.
2022-04-01 16:20:21 +01:00
9e82c468ab Multiplication of diagonal mass in exponentiate fixed for gpus 2022-04-01 15:54:43 +01:00
603fd96747 Missing link multiplication added. 2022-04-01 10:58:56 +01:00
fe993c0836 /=2 replaced by *=0.5 2022-03-31 17:08:17 +01:00
cdf31d52c1 GaugeGrid and typo fixed 2022-03-31 17:04:35 +01:00
0542eaf1da First version of conserved current contraction for Wilson type quarks 2022-03-31 17:02:09 +01:00
317bdcf158 nerscio parametrization 2022-03-24 13:10:47 +01:00
387397374a Current run options 2022-03-23 16:35:11 -04:00
9ca2c98882 Merge branch 'develop' of https://github.com/paboyle/Grid into mbruno-eclover 2022-03-22 15:31:37 +01:00
605cf401e1 Merge branch 'feature/sumd-npr' into develop 2022-03-16 22:43:12 +00:00
f99c3660d2 Merge branch 'feature/cpu-threaded-smp' into develop 2022-03-16 22:07:54 +00:00
92a83a9eb3 Performance improve for Tesseract 2022-03-16 17:14:36 +00:00
53ae01a34a Merge pull request #1 from fjosw/feature/eclover
Feature/eclover
2022-03-15 15:23:35 +01:00
b615fa0f35 Merge pull request #388 from fjosw/feature/sumd-npr
Feature/sumd npr
2022-03-15 09:05:57 -04:00
76c294a7ba open bc fix 2022-03-08 13:55:16 +01:00
0c0c2b1e20 Unnecessary arguments of CloverHelpers::Exponentiate_Clover removed. 2022-03-08 09:44:51 +00:00
e2fc3a0f04 Merge pull request #28 from paboyle/develop
Sync with Upstream
2022-03-08 09:58:51 +01:00
451e7972fd Reintroduced explicit inversion of the Clover term in case of the
CompactExpClover because of the open boundary O(a) improvement. Changed
the timing output to GridLogDebug
2022-03-07 17:43:33 +00:00
56c089d347 Removed leftover comments 2022-03-07 16:40:20 +00:00
acf740e44d Merge pull request #1 from FelixPGZiegler/feature/eclover
Feature/eclover
2022-03-07 16:25:11 +00:00
182f513404 Merge remote-tracking branch 'fjosw/feature/eclover' into feature/eclover 2022-03-07 15:22:04 +00:00
d5b2323a57 included Cayley-Hamilton exponentiation for the compact Wilson exp clover, bug fix for inverse of exp clover 2022-03-07 14:44:24 +00:00
bad18d4417 Merge branch 'paboyle:develop' into feature/eclover 2022-03-07 13:54:10 +00:00
bb5c16b97f New scripts 2022-03-03 17:00:37 -05:00
0d80eeb545 small DDHMC update 2022-03-03 16:56:02 -05:00
d1decee4cc Cleaned up unused variables in Lattice_reduction_gpu.h 2022-03-02 16:54:23 +00:00
d4ae71b880 sum_gpu_large and sum_gpu templates added. 2022-03-02 15:40:18 +00:00
b0f4eee78b New files 2022-03-01 19:09:13 -05:00
5340e50427 HMC running with new formulation 2022-03-01 17:10:25 -05:00
e16fc5b2e4 Threaded intranode comms transfer - ideally between NUMA domains 2022-03-01 11:17:24 -05:00
694306f202 Configure for mac arm 2022-03-01 10:53:44 -05:00
9aac1e6d64 Merge branch 'develop' into feature/sumd-npr 2022-03-01 10:51:38 -05:00
3e882f555d Large / small sumD options 2022-03-01 08:54:45 -05:00
438caab25f generate_instantiations.sh now correctly produces instantiations for CompactClover variant, redundant instantiations removed. 2022-02-27 18:27:18 +00:00
239e2c1ee6 tests: wilson clover cg tests now include compact variant as well as
exponential wilson clover operators
2022-02-27 18:26:34 +00:00
013dc2ef33 tests: core tests for wilson clover and wilson exp clover including
compact version extended/added
2022-02-27 18:13:47 +00:00
9616811c3d Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2022-02-24 22:03:05 +01:00
8a3002c03b separate left and right masses for CayleyFermion5D 2022-02-24 22:02:56 +01:00
0f1c5b08a1 Dirichlet filters running on AMD and now integrated in Fermion op 2022-02-23 19:29:28 -05:00
70988e43d2 Passes multinode dirichlet test with boundaries at
node boundary or at the single rank boundary
2022-02-23 01:42:14 -05:00
71034f828e attempt to fix broken WilsonExpClover; Compact version still broken will be replaced by F.Joswig 2022-02-23 01:02:27 +01:00
aab3bcb46f Dirichlet first cut - wrong answers on dagger multiply.
Struggling to get a compute node so changing systems
2022-02-22 19:58:33 +00:00
11437930c5 cleaned up definitions of wilsonclover fermions 2022-02-22 10:45:16 +01:00
3d44aa9cb9 cleaned up cloverhelpers; fixed test compact_clover which runs 2022-02-22 01:10:19 +01:00
2851870d70 expClover support via helpers template class 2022-02-22 00:05:43 +01:00
da06d15f73 Merge branch 'feature/feature/staggered-comms' into develop 2022-02-17 04:58:50 +00:00
e8b1251b8c Staggered fix finished 2022-02-17 04:51:13 +00:00
63dbaeefaa Extra barrier prior to finalize just in case it fixes an issue on Tursa 2022-02-16 14:01:43 +00:00
e8c187b323 SyCL happier? 2022-02-15 11:24:38 -05:00
fad5a74a4b Bug fix to detection case 2022-02-15 10:27:39 -05:00
e83f6a6ae9 Merge branch 'develop' into feature/feature/staggered-comms 2022-02-15 08:52:39 -05:00
0c1618197f Faster intranode MPI works now 2022-02-15 08:52:07 -05:00
f49d5c2d22 Updated scripts for crusher 2022-02-14 17:55:16 -05:00
a3b022d469 Crusher compile 2022-02-14 15:09:08 -05:00
48772f0976 Merge pull request #384 from jdmaia/hip_launchbounds
Changing thread block order and adding launch_bounds
2022-02-14 11:08:28 -05:00
c322420580 Dont instantiate an Nc=3 and non-GP hardwired code for other implementations 2022-02-14 16:04:08 +00:00
6283d11d50 Add the comment line to tell the existance of copied data/buffer 2022-02-08 15:22:06 +00:00
86f4e17928 Changing thread block order and adding launch_bounds 2022-02-07 11:29:37 -06:00
6616d5d090 Commit 2022-02-02 16:38:24 -05:00
215df671be Merge pull request #382 from DanielRichtmann/feature/compact-clover
Compact Clover Fermions
2022-02-01 21:45:38 -05:00
1b6b12589f Get splitting up into implementation and instantiation files correct 2022-02-02 00:51:11 +01:00
3082ab8252 Check in compact version of wilson clover fermions 2022-02-02 00:50:05 +01:00
add86cd7f4 Abandon ET for clover application, use construct similar to multLink 2022-02-01 23:09:06 +01:00
0b6fd20c54 Enable memory coalescing in clover term generation 2022-02-01 23:09:06 +01:00
e83423fee6 Refactor clover to align with other files and prepare for upcoming changes 2022-02-01 23:09:06 +01:00
b4f8e87982 Have Grid's cli interface understand floats 2022-02-01 23:09:06 +01:00
135808dcfa Less verbose 2021-12-07 16:24:24 -05:00
7f7d06d963 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-12-07 09:06:42 -08:00
2bf3b4d576 Update to reduce memory footpring in benchmark test 2021-12-07 09:02:02 -08:00
0bd83cdbda Fixes for Nc!=3 Nersc IO, Gauge and Gauge_NCxNC compatible with GLU. Trace normalisation changed in places removing explicit threes. Guards against non-su3 tests and tests failing when LIME is not compiled. 2021-11-28 21:51:03 +01:00
f34d34bd17 2 nodes 2021-11-22 22:27:16 -05:00
e32d5141b4 Updated to make MPI reliable still gives good perf, but MPI will be slow
intranode
2021-11-22 21:46:31 -05:00
6d5277f2d7 Update to Spock 2021-11-22 20:58:02 -05:00
14d82777e0 Best modules for spock 2021-11-22 20:47:16 -05:00
2a4e739513 Enable XGMI copy (need to rename nvlink to cover NVLINK/XGMI/XeLink) 2021-11-22 20:46:09 -05:00
8079dc2a14 Cray MPI not working right yet 2021-11-22 20:45:44 -05:00
6ceb556684 Intranode asynch hipMemCopy 2021-11-22 20:45:12 -05:00
76cde73705 HIP improvements on messaging and intranode hipMemCopyAsynch 2021-11-22 20:44:39 -05:00
cc094366a9 Merge pull request #375 from JPRichings/develop
Lattice object ACCcache probe
2021-11-09 18:19:32 -05:00
41a575ff9b Format edit 2021-11-09 21:56:23 +00:00
12ef413065 fix to deflation.h 2021-11-09 21:20:36 +00:00
829a328451 remove deflation timing 2021-11-09 20:46:57 +00:00
402523c62e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-11-09 12:57:40 +00:00
d7bef70b5c Helper functions to allow probe of cache state of lattice objects. 2021-11-09 12:57:09 +00:00
2ad1811642 Added timing to deflation code. 2021-11-09 12:33:25 +00:00
88bdd4344b 2indx antisymm representation of sp2n 2021-11-04 18:27:35 +00:00
a65a497bae Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-10-29 13:01:34 +01:00
b27b12828e reverse previous "fix", missing statement was probably intentional, added a comment to that effect 2021-10-29 13:01:31 +01:00
42d56ea6b6 Verbosity 2021-10-29 02:23:08 +01:00
0b905a72dd Better reduction for GPUs 2021-10-29 02:22:22 +01:00
fe9edf8526 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-29 02:03:27 +01:00
44204c7e06 Extra code 2021-10-29 02:02:56 +01:00
33b3789598 Merge pull request #364 from AndrewYongZhenNing/develop
CayleyFermion5D Conserved current fix
2021-10-27 20:27:20 -04:00
195ab2888d Merge branch 'develop' into develop 2021-10-27 20:26:57 -04:00
85f750d753 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-27 00:28:05 +01:00
a4ce6e42c7 Warning free compile on make all and make tests under nvcc 2021-10-27 00:27:03 +01:00
5398b7e7e3 Max 128 size 2021-10-26 09:16:29 -07:00
fd13a3f2be Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-26 10:45:46 +01:00
c144b32368 deflation timers 2021-10-26 10:37:24 +01:00
4044536eea add projection on sp2n algebra 2021-10-26 10:20:44 +01:00
4d8ae6221c fix projection 2021-10-22 10:44:54 +01:00
ba7e371b90 Warning free compile on Tursa.
Hopefully got all reqd virtual dtors
2021-10-21 19:56:52 +01:00
99e7a5d18a Merge pull request #371 from edbennett/hmc-documentation-update
update documentation for GenericHMCRunner - thanks
2021-10-18 14:36:43 -04:00
f824d99059 update documentation for GenericHMCRunner 2021-10-18 09:50:16 +01:00
749b8022a4 Linear operator and SparseMatrix virtual destructors 2021-10-15 20:47:18 +01:00
7e0057d2c4 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-15 20:46:51 +01:00
cfe9e870d3 Stream 2021-10-15 20:46:44 +01:00
e9c4f06cbf Merge pull request #370 from fjosw/bugfix/gpu_sum_shm
Error Handling sum_Dgpu large objects
2021-10-14 09:12:47 -04:00
1f9688417a Error message added when attempting to sum object which is too large for
the shared memory
2021-10-13 20:45:46 +01:00
4e31e4e094 Better tests 2021-10-13 15:07:23 +01:00
0d6674e489 hot start for sp2n 2021-10-12 18:53:54 +01:00
b145fd4f5b necessary to merge 2021-10-12 17:08:46 +01:00
8a5b794f25 necessary change to merge with upstrm 2021-10-12 16:04:03 +01:00
291e80f88a sp2n as config option 2021-10-12 16:00:32 +01:00
1ace5850ae first hmc 2021-10-12 16:00:32 +01:00
283f14b7c1 fix sp2n projection 2021-10-12 16:00:32 +01:00
1d6e708083 tests! 2021-10-12 16:00:32 +01:00
89457e25e3 sp fermion instantiation 2021-10-12 16:00:32 +01:00
7e3b298d3d project on sp2n 2021-10-12 16:00:32 +01:00
7ff3e5eed4 gauge and fermion implementation for sp2n 2021-10-12 16:00:32 +01:00
19eb51cf41 sp2n generators 2021-10-12 15:53:33 +01:00
470d4dcc6d sp2n as config option 2021-10-12 15:47:56 +01:00
ed03bfd555 first hmc 2021-10-12 12:16:47 +01:00
8c0fbcccae fix sp2n projection 2021-10-12 12:12:16 +01:00
d4866157fe tests! 2021-10-12 09:06:15 +01:00
16c2a99965 Overlap cudamemcpy - didn't set up stream right 2021-10-11 13:31:26 -07:00
b6496b6cb5 sp fermion instantiation 2021-10-11 16:32:10 +01:00
4f5fe57920 project on sp2n 2021-10-11 16:28:15 +01:00
11fb943b1e gauge and fermion implementation for sp2n 2021-10-11 16:21:25 +01:00
cda915a345 Better options 2021-10-07 20:29:09 +01:00
7c16189e16 Merge pull request #368 from Heinrich-BR/develop
Accelerated Pick-Set Checkerboard functions
2021-10-07 15:13:09 -04:00
ecbfccea43 Merge pull request #369 from paboyle/gauge-group-covariance
expose gauge group in GImpl and generic Nc fix
2021-10-07 15:11:12 -04:00
a8eda8f6da Summit scripts 2021-10-05 21:22:10 -04:00
9b1a0653cf Summit results 2021-10-05 21:22:01 -04:00
7cb1ff7395 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-05 20:13:42 -04:00
ab6ea29913 Print removal 2021-10-05 20:13:25 -04:00
b5c81a02b6 Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-10-05 21:13:01 +01:00
d899ee80fc skip record fixed to include norm metadata 2021-10-05 21:12:47 +01:00
4016e705fc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-05 14:56:57 -04:00
2f4e85e5d6 Summit set up 2021-10-05 14:56:17 -04:00
8ed0b57b09 Memory verbose and tracking, shrink default cache
Print PCI device IDs on node 0
2021-10-05 11:41:03 -04:00
046a23121e sp2n generators 2021-10-05 15:51:22 +01:00
a976fa6746 expose gauge group in GImpl and generic Nc fix 2021-10-05 14:19:47 +01:00
6c66b8d997 deflated guesser can optionally be used with less vectors than provided 2021-09-30 19:25:12 +01:00
9523ad3d73 vector version of Schur solver use vector guesser 2021-09-28 12:45:47 +01:00
73a95fa96f LinearFunction loops over vectors by default, can be overloaded 2021-09-28 12:44:26 +01:00
7e130076d6 Fixed line left behind 2021-09-24 17:26:31 +01:00
6efdad6f21 Removed Halo benchmark 2021-09-24 17:18:04 +01:00
a822c48565 Added accelerated pick-set checkerboard functions 2021-09-24 17:13:25 +01:00
014fb76e88 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-09-24 16:45:25 +01:00
30e5311b43 Update from the gods upstream 2021-09-24 16:39:56 +01:00
67e08aa952 New file not run yet 2021-09-23 23:39:55 +02:00
ed1f20f3a1 Merge pull request #367 from mmphys/bugfix/H5NS
Hdf5 namespace
2021-09-23 12:36:11 -04:00
cffc736bb3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-22 06:03:06 -07:00
c0d56a1c04 Perlmutter tune up 2021-09-22 06:02:34 -07:00
3206f69478 SYCL happy 2021-09-21 18:01:35 -07:00
b2ccaad761 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 12:18:05 -07:00
8eb1232683 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 09:25:07 -07:00
c6ce3ad03b Some properties 2021-09-21 09:20:21 -07:00
b3b033d343 Clean 2021-09-21 09:18:54 -07:00
ca9816bfbb Typo 2021-09-21 04:12:04 +02:00
814d5abc7e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 04:05:51 +02:00
a29122e2bf Rebench 2021-09-21 04:05:04 +02:00
e188c0512e Udpdate 2021-09-21 01:04:30 +02:00
1fb6aaf150 Device 2 Device with cudaMemcpy 2021-09-21 01:03:07 +02:00
894654f7ef Simplificatoin, always gather faces 2021-09-21 01:02:34 +02:00
109507888b Option to force use of MPI over Nvlink 2021-09-21 00:53:25 +02:00
68650b61fe Options controlling behaviour 2021-09-21 00:51:01 +02:00
7ee66bf453 Make sure H5NS has empty definition if HDF5 built without C++ namespace. Add comment in Hdf5IO.cc indicating likely source of error using H5NS, i.e. lack of --enable-cxx in hdf5 configure. 2021-09-19 19:45:20 +01:00
8bd70ad8b5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-16 10:22:38 -07:00
af98525766 Merge pull request #359 from paboyle/feature/serialisation-update
Feature/serialisation update
2021-09-16 10:24:52 -04:00
1c2f218519 Merge pull request #360 from pjgeorg/ld-nvcc-openmp
nvcc: Add -fopenmp to LDFLAGS
2021-09-16 10:24:30 -04:00
c9aa1f507c Merge pull request #363 from felixerben/feature/testMesonField
Feature/test meson field
2021-09-16 10:23:58 -04:00
ea7126496d Merge pull request #361 from edbennett/fix-setdevice-message
make message about setdevice consistent with configure script
2021-09-16 10:23:37 -04:00
f660dc67e4 Merge pull request #366 from lehner/feature/gpt
Avx512 mixed prec
2021-09-15 20:27:13 -04:00
ede8faea74 Merge branch 'paboyle:develop' into feature/gpt 2021-09-16 02:23:15 +02:00
1b750761c2 Merge pull request #26 from waterret/feature/gpt
AVX512 drop mixed precision as well
2021-09-16 02:22:52 +02:00
145acf2919 Perf results 2021-09-16 01:06:28 +01:00
cc4a27b9e6 Scripts and performance 2021-09-16 00:15:35 +01:00
b4690e6091 Adding build basics for different systems 2021-09-16 00:00:38 +01:00
4b24800132 AVX512 drop mixed precision as well 2021-09-15 16:29:47 -04:00
9d2238148c Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-09-15 19:25:57 +01:00
c15493218d Two extra routines to break out SchurRedBlack on many RHS into stages to allow efficient deflation & split grid
Split grid solver still to do.
2021-09-15 19:24:39 +01:00
001a556a34 Merge pull request #365 from lehner/feature/gpt
Sync
2021-09-15 13:34:02 -04:00
3d0f88e702 A64FX drop mixed precision as well 2021-09-15 18:38:32 +02:00
dd091d0960 consistent pointer offloading instead of views 2021-09-15 16:58:05 +02:00
e2abbf9520 Merge pull request #25 from paboyle/develop
Sync
2021-09-15 10:02:43 +02:00
c7baeb5bae Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-14 08:31:11 -07:00
402d80e197 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-09-14 16:16:06 +01:00
86e33c8ab2 Significant GPU perf speed up finished 2021-09-14 16:14:23 +01:00
5dae6a6dac Deprecate half prec comms 2021-09-14 15:06:59 +01:00
361bb8a101 Remove half prec comms 2021-09-14 15:06:29 +01:00
7efdb3cd2b Remove half prec comms 2021-09-14 15:06:06 +01:00
65ef4ec29f Move tables to device memory 2021-09-14 15:05:01 +01:00
d5835c0222 Switch to coalesced stencil face gather 2021-09-14 15:04:14 +01:00
a7b943b33e Remove half prec comms 2021-09-14 05:05:33 +01:00
7440cde92f No half prec comms; coalesced access on GPU 2021-09-14 05:04:56 +01:00
0fc662bb24 Dirac cuda 11.4 happy ; force host for functions accessing mult table
ET runs these on host BEFORE lodging result in AST for kernel
2021-09-14 05:00:44 +01:00
8195890640 Force MPI over NVLINK 2021-09-14 05:00:17 +01:00
4c88104a73 Fix compile warns 2021-09-11 23:08:05 +01:00
73b944c152 Drop half prec comms for now. 2021-09-11 23:07:18 +01:00
d1b0b7f5c6 Half prec comms dropping 2021-09-11 23:05:40 +01:00
381d8797d0 Drop half prec comms for now 2021-09-11 23:05:02 +01:00
11ee8a1061 Merge remote-tracking branch 'upstream/develop' into develop 2021-09-02 16:57:42 +01:00
b06526bc1e Comment update 2021-08-30 21:15:39 -04:00
3044419111 Some sample code 2021-08-30 20:32:11 -04:00
bcfa9cf068 Improvement of output 2021-08-28 08:08:15 -07:00
114920b8de Some example clean up 2021-08-25 12:24:17 +01:00
0d588b95f4 Bug fix to Example_Laplacian test 2021-08-23 23:14:26 +01:00
5b3c530aa7 Return value 2021-08-23 15:30:45 +01:00
c6a5499c8b Fail on non-apple 2021-08-22 18:40:55 +01:00
ec9c3fe77a Remove the file 2021-08-22 18:28:39 +01:00
6135ad530e Extra examples / solutions 2021-08-22 18:25:07 +01:00
40098424c7 Examples 2021-08-22 14:17:12 +01:00
7163b31a26 Examples 2021-08-20 01:15:23 +01:00
ffbdd91e0e Apple happiness 2021-08-20 01:15:00 +01:00
5d29e175d8 Typo fix 2021-08-10 18:25:43 +01:00
417dbfa257 Fix 2021-08-10 08:55:35 -07:00
1eda4d8e0b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-08-10 05:41:18 -07:00
50181f16e5 Level 0 IPC set up 2021-08-10 05:35:15 -07:00
75030637cc Improved comms benchmark, same as benchmark_comms_host_device 2021-08-10 05:16:30 -07:00
fe5aaf7677 Make comms benchmark same as Benchmark_comms_host_device 2021-08-09 04:06:30 -07:00
80ac2a73ca Check is wrong (HtoD / DtoH) 2021-08-05 18:33:20 -04:00
770680669d Whitespace removal. 2021-08-04 09:21:59 +01:00
0cdfc5cf22 Merge remote-tracking branch 'upstream/develop' into develop 2021-07-30 14:40:55 +01:00
d75a66a3e6 test done 2021-07-06 11:42:36 +01:00
fcc4374d7b i/o done 2021-07-05 14:52:00 +01:00
67c3c16fe5 working test 2021-07-05 14:41:52 +01:00
25e9be50b5 created test file 2021-07-02 15:51:19 +01:00
428b8ba907 Updated from upstream and added halo benchmark 2021-06-29 01:05:12 +01:00
323cf6c038 make message consistent with configure script 2021-06-23 17:00:43 +01:00
29a22ae603 Simpler SYCL setup 2021-06-22 17:57:20 +00:00
403bff1a47 Force reqd subgroup size fo SYCL 2021-06-22 17:56:10 +00:00
c50f27e68b Make FFT play nice with split grid 2021-06-20 11:34:38 +02:00
80afacec5b nvcc: Add -fopenmp to LDFLAGS 2021-06-17 13:05:13 +02:00
6cd9224dd7 SYCL comms buffer allocate 2021-06-16 17:10:55 +00:00
4bf8196ff1 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-06-15 21:45:36 +00:00
4c5440fb06 const happy for sycl 2021-06-15 21:45:07 +00:00
a269a3d919 Merge pull request #358 from mmphys/feature/serialisation-test
Add a ragged std::vector to the serialisation test
2021-06-09 10:16:25 +01:00
0c4f585496 Test nested std::vector<grid tensor> 2021-06-08 00:05:35 +01:00
33d2df46a0 Merge branch 'develop' into feature/serialisation-test
* develop:
  Update README.md
  removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore)
2021-06-07 23:25:38 +01:00
2df308f649 Add a ragged vector to the serialisation tests. NB: Already had nested (regular) std::vector<std::vector<...>> 2021-06-07 23:25:07 +01:00
92def28bd3 Update README.md 2021-06-06 04:52:05 -04:00
ca10bfa1c7 removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore) 2021-06-04 11:12:22 +01:00
298a6ec51e Merge pull request #357 from mmphys/bugfix/ragged
Bugfix/ragged Multi-dimensional ragged vectors
2021-06-04 10:34:46 +01:00
e5dbe488a6 Merge branch 'develop' into bugfix/ragged
* develop:
  Remove synch
2021-06-03 08:25:56 +01:00
0e27e3847d Remove synch 2021-06-03 04:24:19 +00:00
393727b93b Documentation update (briefly) covering serialisation changes. For review 2021-06-01 15:49:37 +01:00
2b1fcd78c3 Fixes post review with Peter: a) Correct bug in isRegularShape - detect 3d matrix where 1st slice is 2x2 and second slice is 2x1; b) Synchronisation of EigenResizeCounter done by checking we're the OMP primary thread; c) Move definition of EigenResizeCounter to new file, BaseIO.cc 2021-05-31 22:24:54 +01:00
0a4e0b49a0 BaseIO: Added "EigenResizeCounter" to keep track of any allocations/deallocations to Eigen tensors during readback. On read, if the tensor is resized, EigenResizeCounter += delta memory (in bytes) 2021-05-31 12:49:56 +01:00
76af169f05 Add global namespace to Writer<T> and Reader<T> inside GRID_SERIALIZABLE_CLASS_MEMBERS (so that "using Grid" not necessary).
Fix issue with output of Grid::iMatrix so that M<3>{{148,149,150,} {151,152,153,} {154155156}} becomes M<3>{{148,149,150} {151,152,153} {154,155,156}}
2021-05-31 08:43:02 +01:00
7b89232251 Extended HDF5 serialisation of std::vector<T> where T now also includes Grid scalar/vector/matrix
Changed VectorUtils element traits to is_flattenable, because: a) contract changed on what it does; and b) no other Grid dependencies on element. Needs review.
Initial tests work ... needs proper regression testing.
2021-05-30 20:27:53 +01:00
b5aeae526f Make Cshift fields static to avoid repeated reallocaate overhead 2021-05-28 16:33:08 +02:00
ef0ddd5d04 std::vector serialisation in hdf5 uses a different format if the vector is ragged. When reading back std::vector we need to check which format we're reading (since we don't know a priori) and this involves looking for attributes that may not exist. The c++ API: a) throws; and b) prints voluminous logging. Switched to non-throwing, non-logging, C version of the API after code review. 2021-05-24 18:43:55 +01:00
9b73dacf50 First row might still be ragged if multi dimensional. attrExists() doesn't throw, but easier to wrap in try ... catch than to explain in comment. 2021-05-22 04:34:32 +01:00
244b4aa07f Serialise std::vector of numeric types as multidimensional object if size is regular ... or individually if ragged 2021-05-21 20:08:56 +01:00
8cfc7342cd staggered hand unroll read coalesce 2021-05-05 14:17:18 -07:00
15ae317858 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-05-04 08:40:38 -07:00
834f536b5f Fastest option on SyCL is now std::complex 2021-05-04 08:40:18 -07:00
c332d9f08b Merge pull request #356 from felixerben/bugfix/stoutSmearing
Jamie's fix
2021-04-27 14:10:49 -04:00
cf2923d5dd Jamie's fix 2021-04-27 16:53:37 +01:00
0e4413ddde Merge pull request #355 from felixerben/bugfix/stoutSmearing
bugfix 3D stout smearing
2021-04-27 08:01:55 -04:00
009ccd581e bugfix 3D stout smearing 2021-04-26 10:36:33 +01:00
8cd4263974 Tests compile 2021-04-25 22:20:37 -04:00
d45c868656 Change interface 2021-04-25 10:53:34 -04:00
955a8113de Expose label only to reduce number of parameters 2021-04-25 10:36:38 -04:00
dbe210dd53 Open the ens_id 2021-04-25 10:25:59 -04:00
54c6b1376d Quick fix of conserved current implementation in CayleyFermion5D. Now function treats current insertion with appropriate periodic boundary conditions in the mu=3 direction. 2021-04-21 16:56:46 +01:00
86e11743ca set twists 2021-04-20 10:19:11 -04:00
f3f11b586f Tadpole sign now in front of forward hopping term to be consistent with previous implementation and analytic form. 2021-04-17 12:44:27 +01:00
8083e3f7e8 Sign factor for tadpole implementation corrected. 2021-04-15 11:14:31 +01:00
980e721f6e Update MetaData.h 2021-04-13 09:33:01 -04:00
364793154b Reverted checkerboard changes 2021-04-09 15:47:17 +01:00
3e2ae1e9af Added profiling messages to pick and set checkerboard functions 2021-04-08 16:58:47 +01:00
d38ae2fd18 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-04-06 17:18:39 +01:00
030e7754e4 Merge remote-tracking branch 'upstream/develop' into develop 2021-04-06 17:16:13 +01:00
e2a0142d87 Merge pull request #348 from AndrewYongZhenNing/develop
Conserved Tadpole Implementation for Shamir Action Only
2021-04-06 10:49:00 -04:00
895244ecc3 Merge with upstream; implemented conserved tadpole for Shamir action. 2021-04-06 13:46:33 +01:00
addeb621a7 Implemented tadpole operator for Shamir action. 2021-04-06 13:45:37 +01:00
3b7fce1e76 Reverted checkerboard changes 2021-04-02 14:38:41 +01:00
4d15417f93 Merge remote-tracking branch 'upstream/develop' into develop 2021-04-01 18:28:15 +01:00
ab3c855f65 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-04-01 18:22:05 +01:00
92e2c517d8 Changed pick- and setCheckerboard to use accelerator_for 2021-04-01 18:21:19 +01:00
a7fb25adf6 Make Cshift fields static to avoid repeated reallocaate overhead 2021-03-29 21:44:14 +02:00
e947992957 Improved force terms 2021-03-29 20:04:06 +02:00
bb89a82a07 Staggered coalseced read 2021-03-29 20:01:15 +02:00
2bb374daea hip-friendly 2021-03-19 11:33:23 +01:00
8bdadbadac Cold start 2021-03-18 15:41:14 -04:00
15c50a7442 Explicit instantiate the template function 2021-03-18 15:40:42 -04:00
49b0af2c95 Update of tests to compile with the sRNG addition.
Audited the code conventions (again) with the CPS momentum denominator
and added anti periodic in time to the Test_mobius_force.cc and
tested the Test_dwf_gpforce.

Promoted thesee to test full HMC hamiltonian, tr P^2/2 + phidag MdagM phi

with the same pdot and Udot as audited in the Integrator.h etc...

With full comments and sources for factors.
2021-03-18 09:10:02 -04:00
9c2b37218a sRNG parameter added 2021-03-18 06:24:11 -04:00
3c67d626ba Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-12 15:36:55 +01:00
51f506553c Read out the local ID once, and store 2021-03-12 15:33:04 +01:00
226be84937 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-12 09:31:50 -05:00
001814b442 updated to do list. Start adding DDHMC work items 2021-03-12 09:31:17 -05:00
db3ac67506 Update thread issue 2021-03-12 14:55:07 +01:00
da91a884ef NVCC versions found buggy added as guard 2021-03-11 23:54:53 +01:00
a71e6755e3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-11 22:43:06 +01:00
cd5891eecd Test that fails on Cuda 11.0 2021-03-11 22:34:28 +01:00
5bb7336f27 Merge pull request #347 from pjgeorg/fix-autotools-avx512
Fix inconsistent SIMD option AVX512

Thanks
2021-03-11 16:29:07 -05:00
ce1fc1f48a Possible fallback plan for Fionn's compiler bbug in nvcc 2021-03-11 22:20:53 +01:00
82402c6a7c Add simd option SKL for ICC 2021-03-11 13:08:40 +01:00
d9c4afe5b7 Fix inconsistent configure option AVX512
Before this change AVX512 enabled different instruction sets depending
on the compiler:

For Intel C++ Compiler Classic (ICC):
    AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL
    i.e. Intel Xeon Skylake and newer

For Intel ICX, gcc, clang:
    AVX512F, AVX512CD, AVX512ER, AVX512PF
    i.e. Intel Xeon Phi x200/x205 (KNL/KNM)

With this commit AVX512 now only enables the common instruction sets
supported by all CPUs supporting any AVX-512 instructions set:
AVX512F and AVX512CD (called COMMON-AVX512 by icc)
2021-03-11 12:58:49 +01:00
f786ff8d69 Extend test from Fionn, fails on A100 apparently 2021-03-10 14:32:06 -05:00
a651caed5f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-10 06:23:51 -08:00
0e21adb3f6 Gives 200GF/s on SyCL/DG1 8^4, doesn't uglify develop for other platforms too badly.
Easy to revert to clean more C++ stylistic code. Theres a SYCL_HACK macro I will clean up later once dpcpp
evolves a central nervous systems.
2021-03-10 05:40:51 -08:00
58bf9b9e6d Clean up test 2021-03-10 02:45:22 +01:00
2146eebb65 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-09 04:31:46 +01:00
6a429ee6d3 2d loop hits Nvidia 16bit limit on large local vols 2021-03-09 04:31:10 +01:00
4d1ea15c79 More verbosity. The 16bit limit on Grid.y, Grid.z is annoying 2021-03-09 04:29:37 +01:00
a76cb005e0 Update Tensor_exp.h 2021-03-08 13:37:57 -05:00
49ecbc81d4 Merge pull request #24 from ThomasWurm/feature/gpt
Put GlobalSum outside the slice loop in sliceSum
2021-03-08 16:01:47 +01:00
9e5fb52eb9 Put GlobalSum outside the slice loop 2021-03-08 13:53:34 +01:00
a9604367c1 Merge pull request #336 from lehner/feature/gpt
Make ShmDims configurable; adjust GRID_MAX_SIMD to allow for 128 byte width on GPUs
2021-03-05 13:17:19 -05:00
d7065023cc Merge pull request #332 from mmphys/feature/mres_schur
Optional changes to Test_cayley_mres e.g. Schur solver
2021-03-05 12:47:07 -05:00
89d299ceec Merge pull request #333 from mmphys/bugfix/LatTransfer
Fix convertType for GPU in Lattice_transfer.h
2021-03-05 12:46:33 -05:00
e34eda66df Merge pull request #344 from felixerben/feature/XiToSigma
Feature/xi to sigma
2021-03-05 12:45:44 -05:00
b24181aa4f Update Coordinate.h
Revert GRID_MAX_SIMD change
2021-03-05 16:56:58 +01:00
aa173e2998 Update README.md 2021-03-05 10:25:33 -05:00
7a19432e0b whitespace 2021-03-05 10:57:09 +00:00
9b15704290 tested and consitent 2021-03-05 10:42:32 +00:00
017f955b2d Merge branch 'develop' into feature/mres_schur
* develop:
  Pass serial RNG around
  Sycl happier
2021-03-04 20:42:02 +00:00
f252d69eef Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Pass serial RNG around
  Sycl happier
2021-03-04 20:41:30 +00:00
3b06e4655e Merge branch 'develop' into feature/XiToSigma 2021-03-04 20:06:16 +00:00
d4b4de8f42 changes 2021-03-04 20:01:24 +00:00
c90beee774 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-03 23:50:29 +01:00
1eea9d73b9 Pass serial RNG around 2021-03-03 23:50:01 +01:00
679d1d22f7 Sycl happier 2021-03-03 11:21:43 -08:00
b2b5e0b98c Merge branch 'develop' into feature/mres_schur
* develop:
  Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.
  Better SIMD usage/coalescence
2021-03-03 16:15:12 +00:00
03e54722c1 Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.
2021-03-03 16:13:23 +00:00
442336bd96 Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case.
Other cases to do. This now includes comms code path.
2021-03-02 14:50:51 +01:00
9c9566b9c9 Merge pull request #23 from paboyle/develop
Sync
2021-03-01 12:33:51 +01:00
1059a81a3c Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Better SIMD usage/coalescence
2021-02-27 00:21:36 +00:00
2e61556389 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-02-26 17:52:20 +01:00
f9b1f240f6 Better SIMD usage/coalescence 2021-02-26 17:51:41 +01:00
69f41469dd Merge branch 'develop' into bugfix/LatTransfer
* develop: (26 commits)
  Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
  Correct misleading ac help string
  Enable performance counting in WilsonFermion like in others
  changed back A2AUtils warning
  changed if and accelerator_for - no runtime errors any more
  Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
  Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
  Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field
  MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error
  maxLocalNorm2()
  change back benchmark_ITT
  prettify
  Flop cout matches DiRAC-ITT-2020
  revert changes
  merge develop
  fixes
  weird bug in 2pt function...
  revert changes
  final version, tested on CPU and GPU
  bugfix
  ...
2021-02-25 09:19:17 +00:00
d620b303ff Merge branch 'develop' into feature/mres_schur
* develop: (26 commits)
  Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
  Correct misleading ac help string
  Enable performance counting in WilsonFermion like in others
  changed back A2AUtils warning
  changed if and accelerator_for - no runtime errors any more
  Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
  Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
  Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field
  MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error
  maxLocalNorm2()
  change back benchmark_ITT
  prettify
  Flop cout matches DiRAC-ITT-2020
  revert changes
  merge develop
  fixes
  weird bug in 2pt function...
  revert changes
  final version, tested on CPU and GPU
  bugfix
  ...
2021-02-24 18:07:27 +00:00
157fd1428d Merge pull request #342 from paboyle/feature/link-update-mask
Feature/link update mask
2021-02-24 11:29:52 -05:00
c791cb2214 Merge branch 'develop' into feature/link-update-mask 2021-02-23 11:51:54 -05:00
d5ab571a89 Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces
Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC
Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
2021-02-23 11:49:56 -05:00
0ed800f6e4 merge develop 2021-02-23 14:54:46 +00:00
0a32183825 Merge pull request #335 from felixerben/gpu/baryons
Gpu/baryons
2021-02-23 09:30:16 -05:00
2cacfbde2a Merge pull request #341 from DanielRichtmann/fix/minor-things
Minor fixes
2021-02-22 09:28:50 -05:00
c073e62e0b Correct misleading ac help string 2021-02-22 15:25:44 +01:00
e3d019bc2f Enable performance counting in WilsonFermion like in others 2021-02-22 15:25:40 +01:00
7ae030f585 changed back A2AUtils warning 2021-02-18 13:24:50 +00:00
86b58d5aff changed if and accelerator_for - no runtime errors any more 2021-02-18 12:04:32 +00:00
26e8b9f4a5 Merge pull request #340 from mmphys/bugfix/config
Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
2021-02-17 11:56:21 -05:00
35114c9e62 Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu 2021-02-17 13:24:15 +00:00
dfd28a85c9 Merge pull request #339 from mmphys/bugfix/config
Optional rename PACKAGE_ to GRID_ in Grid/Config.h
2021-02-15 13:53:26 -05:00
a503332924 Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working.
Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
2021-02-14 21:27:54 +00:00
1ac13ec3a7 Merge pull request #338 from paboyle/bugfix/maxnorm2
Fixed compile issues with maxLocalNorm2 for non-scalar lattices
2021-02-08 12:08:11 -05:00
55de69a569 Fixed compile issues with maxLocalNorm2 for non-scalar lattices
maxLocalNorm2 test now reuses the random field
2021-02-08 12:03:16 -05:00
eda9ab487b MADWF 5d source option for hadrons - look at Grid of source
Abort on GPU error
2021-02-08 10:47:22 -05:00
cd99edcc5f maxLocalNorm2() 2021-02-04 18:25:49 -05:00
4705aa541d Allow user to configure ShmDims via environment variables 2021-02-04 14:25:55 +01:00
3215d88a91 Simplify syntax with Grid::EnableIf post code review. Updated EnableIf so that ReturnType defaults to void in same way as std::enable_if see https://en.cppreference.com/w/cpp/types/enable_if 2021-02-03 15:17:03 +00:00
9b9a53f870 ... 2021-02-02 13:06:43 +00:00
019ffe17d4 Allow for GPU vector width beyond 64 2021-02-02 11:32:23 +01:00
bc496dd844 change back benchmark_ITT 2021-01-28 14:29:56 +00:00
a673b6a54d prettify 2021-01-28 14:15:09 +00:00
1bf2e4d187 Merge branch 'develop' into gpu/baryons 2021-01-27 21:17:37 +00:00
96dd7a8fbd Flop cout matches DiRAC-ITT-2020 2021-01-27 21:14:52 +00:00
7905afa9f5 revert changes 2021-01-27 21:14:52 +00:00
712bb40650 merge develop 2021-01-27 21:14:52 +00:00
81d88d9f4d fixes 2021-01-27 21:09:51 +00:00
77063418da Fix issue for GPU by ensuring accelerator_inline version of convertType is available for Grid::complex<T>. This removes many warnings in Hadrons
Simplify the SFINAE syntax and correct convertType for iScalar
2021-01-25 15:09:36 +00:00
2983b6fdf6 Optional (superficial) changes to make comparison with Hadrons WardIdentity module easier: use Schur solver; example of Hadrons random gauge init; logging updates; only solve reverse propagator if provided 2021-01-23 12:41:48 +00:00
69f1f04f74 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-01-21 21:39:59 -05:00
11a5fd09d6 Hot config 2021-01-21 21:39:41 -05:00
ff1fa98808 Fix for GPU conserveed current 2021-01-21 21:38:23 -05:00
df16202865 weird bug in 2pt function... 2021-01-19 19:25:27 +00:00
3ff7c2c02a Merge branch 'develop' into gpu/baryons 2021-01-19 12:34:13 +00:00
fc6d07897f revert changes 2021-01-19 12:32:48 +00:00
f9c8e5c8ef Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-01-19 12:30:29 +00:00
8bfa0e74f8 final version, tested on CPU and GPU 2021-01-19 12:27:57 +00:00
9b73a937e7 bugfix 2021-01-18 18:57:05 +00:00
b0339bc5a4 Merge branch 'feature/conjugate-bc-dirs' into develop 2021-01-15 09:28:39 -05:00
3c23a947cc Fixed test for very much non-unit det 2021-01-15 09:16:02 -05:00
56111bb823 Merge branch 'develop' into feature/conjugate-bc-dirs 2021-01-14 21:01:22 -05:00
99445673f6 Gparity fix, and plaquette IO 2021-01-14 21:00:36 -05:00
97a59643f7 Red black coarse space 2021-01-14 20:49:13 -05:00
579595f547 Red black on coarse space 2021-01-14 20:48:35 -05:00
281ac5fc12 Red black support on coars 2021-01-14 20:48:08 -05:00
d8fa903b02 G5 on coarse spaces 2021-01-14 20:47:28 -05:00
eaff0f3aeb Gamma5 on coaree spaces 2021-01-14 20:46:58 -05:00
e8e20c01b2 Coarsened vector test 2021-01-14 20:46:21 -05:00
a4afc3ea2a Red black coarse space 2021-01-14 20:44:16 -05:00
fa12b9a329 bugfix 2021-01-13 10:04:17 +00:00
45fc7ded3a test for sum 2021-01-12 09:10:37 +00:00
74de2d9742 whitespace changes 2021-01-08 18:28:36 +00:00
e759367d42 tested and working 2021-01-08 18:04:50 +00:00
299d0de066 Merge pull request #21 from paboyle/develop
Sync
2020-12-22 20:59:15 +01:00
3fe75bc7cb Merge pull request #329 from nmeyer-ur/feature/a64fx-3
Revised dslash/dwf kernels for A64FX
2020-12-20 08:17:15 -05:00
45d49d8648 clean up 2020-12-19 03:35:18 +01:00
6013183361 removed Asm impls 2020-12-19 03:25:01 +01:00
4b882e8056 fixed lost bracket 2020-12-19 03:09:20 +01:00
3f9ae6e7e7 Merge branch 'develop' into feature/a64fx-3 2020-12-19 02:37:11 +01:00
909acd55cd vnum variant for prefetches 2020-12-19 02:00:22 +01:00
4dd9e39e0d up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1 2020-12-19 00:54:31 +01:00
b4c1317ab4 Merge pull request #22 from DanielRichtmann/feature/clover-access-specifier
Clover access specifier
2020-12-18 16:20:19 +01:00
f36d6f3923 compiles on GPU. 3pt still wrong!!!! 2020-12-17 17:04:08 +00:00
7adb253e25 Merge pull request #328 from mmphys/feature/mrespatch
Enable existing conserved current code for CUDA
2020-12-17 11:10:29 -05:00
808f1e0e8c merge develop 2020-12-15 16:33:29 +00:00
873519e960 Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration 2020-12-14 16:06:10 +00:00
9aec4a3c26 SYCL 2020-12-10 02:11:17 -08:00
c438118fd7 Change access specifier of clover fields in order to allow deriving classes to access these 2020-12-08 14:42:11 +01:00
70510d151b Merge pull request #327 from paboyle/feature/gparity_twist_GPU
Feature/gparity twist gpu
2020-12-07 12:02:20 -05:00
9e7bacb5a4 Merge branch 'develop' into feature/gparity_twist_GPU 2020-12-07 11:55:39 -05:00
2ef1fa66a8 Improved performance of G-parity kernel for GPUs by simplifying multLink implementation 2020-12-07 11:53:35 -05:00
cf76741ec6 Intel DPCPP Gold happy now (compiles all, runs Benchmark_dwf_fp32 ) 2020-12-03 03:47:11 -08:00
497e7c1c40 Duplicate code 2020-12-02 17:55:30 -08:00
888eacd3b8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-24 21:46:33 -05:00
321f0f51b5 Project to SU(N) 2020-11-24 21:46:10 -05:00
17ec9c5545 Merge pull request #20 from paboyle/develop
Sync
2020-11-24 12:20:43 +01:00
30ad9578a2 Merge branch 'lehner-feature/gpt' into develop 2020-11-24 06:10:24 -05:00
9dce101586 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into lehner-feature/gpt 2020-11-24 06:10:16 -05:00
97e264d0ff Christoph's changes 2020-11-23 15:46:11 +00:00
683a5e5bf5 Stencil use host vector for integera table on enable-shared=no and mirror it on device 2020-11-23 15:39:51 +00:00
d4861a362c Stencil use non-UVM memory for look up table on enable-shared=no 2020-11-23 15:38:49 +00:00
5ff3eae027 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-20 13:14:44 -05:00
147dc15d26 Update 2020-11-20 13:13:59 -05:00
c61ea72949 Merge pull request #19 from paboyle/develop
Sync
2020-11-20 17:31:13 +01:00
86e8b9fe38 ALLOC_ALIGN removed 2020-11-20 17:07:16 +01:00
612e468889 Configurable ALLOC_ALIGN and ALLOC_CACHE 2020-11-20 16:48:28 +01:00
4ea8d128c2 Merge pull request #18 from paboyle/develop
Sync
2020-11-20 15:36:50 +01:00
e49b7f2f88 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-19 19:24:41 +01:00
aace3d47b9 partial work in progress 2020-11-19 19:24:14 +01:00
d5049949a4 Starting to fix reunitarise 2020-11-19 19:23:41 +01:00
f1c7480e3c Warning remove 2020-11-19 19:23:03 +01:00
5adae5d6ff Unused variable remove 2020-11-19 19:22:12 +01:00
a8412ace05 Merge pull request #317 from i-kanamori/develop
adding an error check for input: Parameters.StartingType
2020-11-18 23:09:40 -05:00
9fd1c2ad4b Merge pull request #325 from DanielRichtmann/feature/threaded-clover-inversion
Threaded clover term inversion
2020-11-18 23:08:37 -05:00
4cf3575353 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-18 03:07:36 +00:00
804a810d68 Wildcard mismatch 2020-11-18 03:06:53 +00:00
8fcb392e24 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-17 04:51:31 -08:00
dd8d70eeff Build without LIME 2020-11-17 04:41:15 -08:00
aa8aba6543 --shm-force-mpi 2020-11-16 20:15:50 -05:00
13df14f96e Switch off SHM paths with --disable-shm 2020-11-16 18:07:15 -05:00
3aab983760 Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency) 2020-11-16 17:13:58 +01:00
9c4dcc5ea3 Merge branch 'master' into develop 2020-11-16 16:34:57 +01:00
a1063ddbb9 Update options and simplify 2020-11-13 04:11:03 +01:00
18ef8056ec Hide Shared Memory 2020-11-13 04:10:40 +01:00
1c673977fa Must ask for COMMMS_THREADS 2020-11-13 03:59:36 +01:00
e9bc748828 Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel 2020-11-13 03:58:34 +01:00
f48156529b Work on 2,2,2,8 ranks 2020-11-13 03:57:58 +01:00
d05ce01809 TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED 2020-11-13 03:52:19 +01:00
cf23eff60e Device to Device, Memset, cannot assume UVM == Communicable 2020-11-13 03:51:08 +01:00
6e313575be Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac 2020-11-13 03:50:16 +01:00
b13d1f7238 TOFU compat flag to help Isaaku 2020-11-13 03:49:44 +01:00
b5e7945dd9 Option for host or device Cshift implementation 2020-11-13 01:38:54 +01:00
7535566f54 Option for bounce through the SHM buffer 2020-11-12 22:54:27 +01:00
50b808ab33 Configure option between host and device 2020-11-12 22:28:12 +01:00
f16c2665f5 Host memory explict 2020-11-12 20:29:58 +01:00
41e28015ae Volume divisible guarantee 2020-11-07 13:32:16 +01:00
3594ce877b speedup in Sigma-to-nucleon 2020-11-03 20:04:30 +00:00
9bae6b889a speedup in Sigma-to-nucleon 2020-11-03 20:03:09 +00:00
4014dfd5b9 first tested version 2020-11-03 16:13:08 +00:00
67023c334b bugfix 2020-11-03 13:07:37 +00:00
a3de7026c8 bugfix 2020-11-03 12:51:50 +00:00
ee11678b1f added Xi-to-Sigma rare decays 2020-11-03 12:41:35 +00:00
a0ccbb3bd6 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-01 01:16:35 +00:00
5eeabaa2bb HIP fix 2020-11-01 01:16:01 +00:00
00d0d6d008 Hip Free managed 2020-10-31 18:14:31 -04:00
537a9f7030 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-31 18:13:30 -04:00
cc9c993f74 Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt 2020-10-31 18:12:47 -04:00
d10422ded8 Test project on group 2020-10-31 18:12:30 -04:00
f313565a3c HiP compile 2020-10-31 12:12:40 +00:00
b3881d2636 Thread inversion of clover term 2020-10-30 16:18:58 +01:00
61d5860b46 Merge pull request #318 from rrhodgson/feature/BaryonSpinMat
Added untraced baryon contraction code
2020-10-28 18:39:59 +00:00
52d17987dc BaryonUtils.h updated debug output 2020-10-23 11:41:08 +01:00
19d8bba97d BaryonUtils function naming change 2020-10-21 11:58:53 +01:00
463d72d322 Added untraced baryon contraction code 2020-10-19 16:13:28 +01:00
d060341168 add an error check for Parameters.StartingType 2020-10-16 21:39:17 +09:00
c772bcd514 Merge https://github.com/paboyle/Grid into develop 2020-10-16 20:30:32 +09:00
3362f8dfa0 happy compile 2020-10-14 22:59:41 -04:00
bf3c9857e0 Closure changes 2020-10-14 21:37:14 -04:00
a88b3ceca5 Closure cases 2020-10-14 21:33:51 -04:00
aa135412f5 toComplex, toReal 2020-10-13 22:25:01 -04:00
9945399e60 Reaality issues fix by drop from ET 2020-10-13 22:24:32 -04:00
5eeffa49e8 Reality forced included 2020-10-13 22:23:57 -04:00
3f06209720 Pretty print 2020-10-13 22:18:51 -04:00
12e239dd9f Merge branch 'release/dirac-ITT-2020' 2020-10-13 13:38:29 -04:00
af2301afbb Merge pull request #312 from i-kanamori/debug_512
add reordring of random number generators in IO
2020-10-13 11:42:12 -04:00
f98856a26f Merge pull request #314 from smangham/issue_readme_precision
Fix for deprecated configure options in documentation (issue #313)
2020-10-13 11:41:38 -04:00
d55cc5b380 Fixed typo on --enable-comm, removed all references to --enable-precision except for config options, where it is listed as deprecated. Removed travis test for single precision. 2020-10-12 12:33:13 +01:00
c2b688abc9 Benchmark_IO: reducing max local volume to 32^4 2020-10-10 16:52:56 +01:00
b0d61b9687 Benchmark_IO cleaner output 2020-10-09 21:46:45 +01:00
5f893bf9af Benchmark_IO procurement sizes 2020-10-09 21:31:59 +01:00
0e17bd6597 I/O benchmark cleanup 2020-10-09 20:29:57 +01:00
22caa158cc multi-pass I/O benchmark, with statistic and robustness summary 2020-10-09 20:29:40 +01:00
b24a504d7c hook to access last parallel I/O performance measurement 2020-10-09 20:28:54 +01:00
992ef6e9fc more runtime 2020-10-08 22:19:20 -04:00
f32a320bc3 Single prec benchmark in double prec compile 2020-10-08 19:52:08 -04:00
5f0fe029d2 Improve meemory benchmarks for GPU (avoid host mem ping pong) 2020-10-08 19:51:28 -04:00
6b1486e89b fixing number of colours defaulting to 4 in most cases 2020-10-08 16:31:24 +01:00
3f9c427a3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-07 13:12:57 -04:00
d201277652 Expose Nc as a compile time configure option.
Remove precision option
2020-10-07 13:07:00 -04:00
fdda7cf9cf Merge branch 'feature/benchmark-io-update' into develop 2020-10-07 15:57:53 +01:00
e22d30f715 Merge branch 'develop' into feature/benchmark-io-update 2020-10-07 15:56:39 +01:00
1ba25a0d8c more I/O benchmark code cleaning 2020-10-07 15:38:41 +01:00
9ba3647bdf script to convert I/O benchmark logs to CSV 2020-10-07 15:35:03 +01:00
5ee832f738 I/O benchmark code cleaning 2020-10-07 15:31:51 +01:00
467deee46f Merge branch 'debug_512' into develop 2020-10-07 15:18:44 +09:00
35a69a5133 SU4 x SU4 2020-10-06 21:48:35 -04:00
e9c5a271a8 fixing potential issues with log alignment and timer I/O 2020-10-06 17:58:16 +01:00
acac2d6938 standard C/C++ I/O in benchmark 2020-10-06 17:57:00 +01:00
97db2b8d20 add reordring of random number generator in IO 2020-10-06 17:25:59 +09:00
80fd6ab407 Merge pull request #17 from paboyle/develop
sync upstream
2020-10-06 09:01:39 +02:00
5534921bee Merge pull request #16 from DanielRichtmann/feature/gpt-coarsenedmatrix
Enable checkerboard operations for CoarsenedMatrix
2020-10-01 10:55:13 +02:00
ace9cd64bb dpcpp happy 2020-09-29 08:03:46 -07:00
a3e2aeb603 dpcpp options happiness 2020-09-29 06:50:10 -07:00
049dd25785 Revert accidental commit thanks michael 2020-09-23 04:13:50 -04:00
d43d372294 Merge pull request #311 from mmphys/bugfix/MPIasynch
Asynchronous calls removed - reflect this in Communicator_none.cc
2020-09-22 10:41:48 -04:00
b71a081cba Asynchronous calls removed - reflect this in Communicator_none.cc
(Opportunistic doc update - OpenMP support on Mac OS)
2020-09-21 09:33:23 +01:00
c48909590b MPI asynch call removal 2020-09-17 20:47:32 +01:00
446ef40570 HIP IPC 2020-09-17 20:31:46 +01:00
81441e98f4 HIP runs sensible 2020-09-16 03:35:03 +01:00
ecd3f890f5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-16 02:30:14 +01:00
1c881ce23c HIP does not like half2 visible members x and y so must define own Half2 2020-09-16 02:28:33 +01:00
dacbbdd051 Hip Happy Birthday 2020-09-16 00:37:02 +01:00
2859955a03 HIP requires "inline" 2020-09-16 00:36:13 +01:00
cc220abd1d inline for HIP 2020-09-16 00:35:38 +01:00
d1c0c0197e HipCC requires inline on definition 2020-09-16 00:35:06 +01:00
fd9424ef27 innlines required to make HIP happy 2020-09-16 00:34:32 +01:00
a5c35c4024 Make HIP / Vega happy 2020-09-16 00:33:53 +01:00
e03b64dc06 HIP default flaags to work on ROCM 2020-09-16 00:33:09 +01:00
4677c40195 HIP improvements 2020-09-16 00:32:27 +01:00
288c615782 Hip improvements 2020-09-16 00:31:50 +01:00
48e81cf6f8 Hip Pragmas 2020-09-16 00:31:03 +01:00
5cffa05c7e remove slab allocator file 2020-09-13 14:06:25 -04:00
d50a2164d7 remove slab allocator 2020-09-13 14:06:06 -04:00
32ff766dbd fix evict scheme, slab alloc 2020-09-13 14:02:53 -04:00
01652d8cfe SlabAllocator 2020-09-13 05:56:02 -04:00
4d2dc7ba03 Enable even-odd for CoarsenedMatrix 2020-09-11 20:32:02 +02:00
51d1beb1f3 Merge pull request #15 from paboyle/develop
Sync with upstream
2020-09-07 14:20:33 +02:00
65b724bb5f 2 level hddcr 2020-09-03 21:46:43 -04:00
6dbd117aa5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-03 20:30:49 -04:00
198b29f618 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-03 20:29:54 -04:00
a8309638d4 UVM check in MPI calls 2020-09-03 20:29:26 -04:00
f98a4e880e Merge pull request #310 from kostrzewa/accelerator_vector_stream_op_no_backspace
do not use backspace in AcceleratorVector (Coordinate) output stream operator
2020-09-03 20:24:59 -04:00
8244caff25 Remove the asynchronous non-Stencil calls. 2020-09-03 18:52:55 -04:00
bcd7895362 Include cuda.h 2020-09-03 15:49:13 -04:00
85b1c5df39 A never hit case that is not 100% confident is asserted for safety 2020-09-03 15:48:16 -04:00
b4255140d6 Stale data member eliminated 2020-09-03 15:47:46 -04:00
0c3095e173 Comms buffers to device memory 2020-09-03 15:45:35 -04:00
d3ce60713d UVM, Device and Lattice/aligned allocators 2020-09-03 15:44:13 -04:00
eac1f08b7b Close expressions passed as an argument 2020-09-01 15:30:33 -04:00
1654c4f3c0 Closure improved 2020-09-01 15:29:45 -04:00
8807d998bc closure improved 2020-09-01 15:29:11 -04:00
5791021dcd Speed up Cshift more with coalesced 2020-09-01 15:28:15 -04:00
c273fb051c Peek poke laattice 2020-09-01 15:27:59 -04:00
c545530170 little worry large Nbasis doesnt compile GPU 2020-09-01 00:14:33 -04:00
d982a5b6d5 Fix coaarsened 2020-09-01 00:14:04 -04:00
15ca8637f3 No norms in HermOp 2020-09-01 00:13:32 -04:00
cbc995b74c Made better interface 2020-09-01 00:12:54 -04:00
8b74174d74 Eigen tensor serialisatiino happy undeer GPU. Regret agreeing to let us couple Eigen types to Grid IO 2020-09-01 00:03:26 -04:00
e21fef17df real and imag part not in ET 2020-08-31 23:56:26 -04:00
3d27708f07 Basic where test 2020-08-31 23:55:49 -04:00
b918744184 Prettificatoin 2020-08-31 23:54:46 -04:00
7d14a3c086 Where working 2020-08-31 23:53:46 -04:00
e14a84317d GPU math unary calls 2020-08-31 23:50:49 -04:00
6c31b99f1f I knew coupling Eigen Tensor to Grid serialisation was a bad iddea.
Now the complex is different on GPU creates probblems
2020-08-31 23:49:19 -04:00
9522dcd611 Remove dead commented ouot coode 2020-08-31 23:40:29 -04:00
ed469898dc coalesced ET expressions 2020-08-31 23:38:40 -04:00
1eee94a809 Sorting real/im in read coalesced GPU ET 2020-08-31 23:36:49 -04:00
54523369a3 do not use backspace in Coordinate output stream operator 2020-08-31 19:39:36 +02:00
a98c91c2a5 Merge pull request #309 from kostrzewa/format_benchmark_wilson_sweep
Format benchmark wilson sweep
2020-08-31 12:43:46 -04:00
a9b92867a8 use tabulator 2020-08-31 18:41:17 +02:00
65920faeba correct formatting of Benchmark_wilson_sweep output 2020-08-31 18:39:27 +02:00
249e2db87d Merge pull request #14 from DanielRichtmann/feature/gpt-coarsenedmatrix
Expose more functions in CMat
2020-08-27 15:18:56 +02:00
cf3535d16e Expose more functions in CMat 2020-08-27 14:06:48 +02:00
d61ee817f4 Merge pull request #13 from DanielRichtmann/feature/gpt-coarsenedmatrix
Changes needed for GPT MG
2020-08-27 12:11:06 +02:00
3448b7387c Almost there to coalesced ET 2020-08-26 17:04:49 -04:00
47b89d2739 Pragma protection improvementt 2020-08-26 17:04:27 -04:00
2a75516330 state MPI/SLURM message only on world_rank zero 2020-08-26 12:34:17 -04:00
b2087f14c4 Fix CoarsenedMatrix regarding illegal memory accesses
Need a reference to geom since the lambda copies the this pointer which points to host memory, see
- https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture
- https://devblogs.nvidia.com/new-compiler-features-cuda-8/
2020-08-24 17:46:47 +02:00
dd1ba266b2 Fix mapping between dir + disp and point in CMat 2020-08-24 17:46:46 +02:00
1292d59563 Add a typedef + broaden interface of CMat 2020-08-24 17:46:45 +02:00
9877ed9bf8 Merge pull request #12 from paboyle/develop
Sync
2020-08-22 16:35:35 +02:00
f0dc0f3621 fix compile issue on Qpace3 2020-08-22 13:57:33 +02:00
1efe30d6cc SLurm stop nodes using same GPU 2020-08-21 02:02:53 +02:00
0b787e9fe0 Avoid namespaec collision to make gcc happy 2020-08-20 22:23:29 +02:00
37ec4b241c Default thread count sensible 2020-08-20 22:12:31 +02:00
63b0a19f37 Merge pull request #11 from paboyle/develop
Sync
2020-08-20 20:53:39 +02:00
90ea7dfa99 Accelerator loops for device resident comms buf 2020-08-19 22:40:44 +02:00
f866d7c33e Merge pull request #307 from lehner/feature/gpt
Merged Nils's A64FX and minor fixes (MemoryManager::InitMessage, Tensor_index zeroit, ...)
2020-08-18 23:27:21 -04:00
542bdef198 cleanup comments 2020-08-14 18:39:44 +02:00
06007db3d9 true shm_none implementation with GPUs that disables the use of device shared memory for the stencils 2020-08-14 18:37:00 +02:00
12e6059a70 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2020-08-13 16:16:52 +02:00
dbaa24ebf6 further GPU memory access fixes (with this GPT passes all single-rank tests on non-summit GPUs) 2020-08-13 16:14:15 +02:00
3276aa67dc Update 2020-08-12 14:15:53 -04:00
3b30b9f0c0 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2020-08-06 16:59:17 +02:00
69db4816f7 fix variable capture in Scatter_plane_merge on accelerators 2020-08-06 16:57:16 +02:00
3abe09025a when using SHM_NONE allow multiple ranks per node but without using shared memory 2020-08-06 14:42:38 +02:00
e33878e0de Trigger re-run of CI 2020-08-06 11:50:24 +02:00
27b4fbf3f0 assert for forbidden code path and fix check for faster CPU codepath in basisRotate 2020-08-03 07:57:33 -04:00
968a90633a Zero -> zeroit in Tensor_index 2020-07-31 02:07:17 -04:00
6365a89ba3 create separate InitMessage for MemoryManager that can be called after communicator setup 2020-07-30 07:25:05 -04:00
ddbb008694 Merge pull request #10 from lehner/feature/gpt-sycl
Feature/gpt sycl
2020-07-30 13:12:09 +02:00
7997e0a449 Merge branch 'feature/gpt' into feature/gpt-sycl 2020-07-30 13:11:31 +02:00
197612bc7a fast cpu basisRotate and other small cleanups 2020-07-30 07:08:54 -04:00
0e88bf4bff remove Nils's default pragma 2020-07-29 10:24:35 -04:00
3e64d78469 include versions.h again and add back asserts in Test_simd 2020-07-29 10:18:05 -04:00
2004611def Merge pull request #9 from nmeyer-ur/feature/a64fx-2
Feature/a64fx 2
2020-07-29 14:54:20 +02:00
a2868c96a4 Merge pull request #8 from paboyle/develop
Doc recompile
2020-07-29 14:10:07 +02:00
7cf7f11e1a Doc recompile 2020-07-22 14:44:11 -04:00
ea7f8fda5e fix typo 2020-07-22 09:34:05 +02:00
906b78811b exit in Init when using --comms-overlap 2020-07-22 08:57:01 +02:00
97703b181b Merge pull request #7 from paboyle/develop
Merge current develop
2020-07-12 16:24:53 +02:00
d9474c6cb6 compiler-independent build using --enable-simd=A64FX 2020-07-09 10:07:02 +02:00
bbd145382b enable --enable-simd=A64FX in configure 2020-07-08 12:43:51 +02:00
1b08cb7300 Merge branch 'develop' into feature/a64fx-2 2020-07-08 08:18:18 +02:00
337d9dc043 move barrier in Benchmark_wilson 2020-07-08 08:13:40 +02:00
8726e94ea7 merge upstream develop 2020-07-07 20:26:47 +02:00
67db4993c2 reset head, update SVE readme 2020-07-07 19:54:52 +02:00
f1f655d92b Merge pull request #304 from Heinrich-BR/develop
ScalarImpl.h updates
2020-07-06 10:16:03 +01:00
43334e88c3 Tiny change in a comment for clarity 2020-07-04 16:11:16 +01:00
4f1e66b044 Fixed HMC SU(N) integrator which was causing fields to leave Lie Algebra manifold for N>2 2020-07-04 03:53:06 +01:00
fd3c8b0e85 correct build instructions qp4 2020-07-01 09:00:38 +02:00
1635c263ee disable TOFU by default 2020-06-30 19:27:08 +02:00
64fe5b21b4 Merge pull request #298 from rrhodgson/feature/baryon
Update baryon 2pt and add 3pt function
2020-06-29 18:45:00 +01:00
ee9889821d Runs through to coarse space solve 2020-06-29 12:59:52 -04:00
eb470aa6dc Update to baryon and added comments/fix whitespace 2020-06-29 09:43:01 +01:00
77af9a3ddc Baryon revert sign 2020-06-26 10:08:42 +01:00
102089798c BaryonUtils: update to autoView 2020-06-25 16:41:58 +01:00
39cea8b5a7 Merge branch 'develop' into feature/baryon 2020-06-25 16:24:07 +01:00
a65f66d2db Merge branch 'feature/baryon3pt' into feature/baryon 2020-06-25 16:20:59 +01:00
936c5ecf69 Reduction GPU no compile fix 2020-06-24 17:28:31 -04:00
22cfbdbbb3 Boost precision in inner products in single 2020-06-24 12:52:31 -04:00
093d1ee21b Force initial values 2020-06-24 08:54:49 -04:00
d6ba2581ce Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-06-24 08:25:08 -04:00
577c064184 Memory manager initialise earlier 2020-06-24 08:24:38 -04:00
2ff1fa6fad UVM used shared for CPU alloccations andd ddont migrate 2020-06-23 22:14:56 -04:00
70be1bd8be Adding code under development 2020-06-23 10:24:21 -04:00
4ef50ba31f Baryon speedup 2020-06-23 11:44:20 +01:00
3e97a26f90 BaryonGamm3pt threads -> accelerator 2020-06-23 11:35:32 +01:00
599f28f6ef Baryon bug fixes 2020-06-23 11:10:26 +01:00
c48da35921 Memory Vector UVM and Lattice alignedAllocator separate 2020-06-22 20:21:53 -04:00
6c5fa8dcd8 Aligned allocate on CPU put through this interface 2020-06-20 14:34:29 -04:00
0d2f913a1a String.h for linux 2020-06-20 09:37:31 -04:00
5b117865b2 Merge pull request #6 from paboyle/sycl
Sycl
2020-06-20 09:44:44 +02:00
1a74816c25 Hopeefully fixed 2020-06-19 17:50:52 -04:00
73de335256 Merge branch 'develop' into sycl 2020-06-19 17:44:16 -04:00
228fd450ce Typo fix (excusee - my keyboard is starting to break) 2020-06-19 17:36:05 -04:00
b949cf6b12 PeekLocal needs a view to keep thread safe.
ALLOCATION_CACHEE reenable
2020-06-19 17:13:27 -04:00
11bc1aeadc TThread count defaultt to fastest 2020-06-19 14:30:35 -04:00
66005929af Set up the cache size on all ranks 2020-06-19 12:50:54 -04:00
05bbc49a99 Edge case in GetShmDim check 2020-06-19 12:01:23 -04:00
ff7c847735 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-19 01:22:16 -04:00
1aa988b2af Comms overlap fix UVM case 2020-06-19 01:21:14 -04:00
edf17708a8 Range improvement 2020-06-18 22:41:06 -04:00
81a8209749 ConvertType for blockInnerProduct 2020-06-18 11:53:21 -04:00
a87e45ba25 SVE readme update 2020-06-18 11:23:08 +02:00
465856331a switch back to serialized; wrong results on single too 2020-06-15 15:39:39 +02:00
cc958aa9ed switch back to standard MPI_init due to wrong results in Benchmark_wilson using comms-overlap 2020-06-15 14:21:38 +02:00
f46f029dbb Merge pull request #292 from lehner/feature/gpt-sycl
Catch edge case in SharedMemoryMPI::GetShmDims; Change default units …
2020-06-14 13:43:27 -04:00
3dccd7aa2c Catch edge case in SharedMemoryMPI::GetShmDims; Change default units to consistent MB in init args; Want last element not past last element in MemoryManagerCache.cc 2020-06-14 13:26:01 -04:00
a25e4b3d0c pred 32/64 for float/double instead of 8 in VLA patch 2020-06-13 14:44:37 +02:00
d1210ca12a switch to double/float instead of float64_t/float32_t in VLA patch 2020-06-13 13:59:32 +02:00
36ea0e222a type traits for ComplexF/D in VLA patch; cosmetics in VLS intrinsics 2020-06-13 13:42:35 +02:00
65e6e7da6f Merge pull request #291 from lehner/feature/gpt-sycl
Feature/gpt sycl
2020-06-12 20:42:32 -04:00
b5e87e8d97 summit compile fixes 2020-06-12 18:16:12 -04:00
5f5807d60a cleanup 2020-06-12 14:48:23 -04:00
92281ec22d add 3 op Mult for VLA 2020-06-12 18:49:05 +02:00
87266ce099 comment out fcmla in vector types: need also MultAddReal 2020-06-12 18:37:19 +02:00
2a23f133e8 reenable fcmla for VLA 2020-06-12 17:30:38 +02:00
8dbf790f62 correct tbl2 for sp 2020-06-12 17:12:34 +02:00
2402b4940e vec_imm in float 2020-06-12 15:17:38 +02:00
2111052fbe apply VLA patch for memcpy reduction suggested by Arm, CAS-162542-D6W7Z7 2020-06-12 14:49:19 +02:00
7974acff54 merged sycl to feature-gpt 2020-06-12 06:49:38 -04:00
f0d17d2b49 Added Baryon3pt code 2020-06-12 11:35:52 +01:00
244c003a1b Updated Baryon code 2020-06-12 11:00:25 +01:00
0174f5f742 look for librt when using shm=shmopen 2020-06-11 16:50:43 +01:00
32b2b59be4 Offload 2020-06-10 20:36:26 -04:00
86bb0cc24b Keep on GPU 2020-06-10 20:00:00 -04:00
84c19587e7 Offload 2020-06-10 19:59:31 -04:00
237ce92540 Offload loops 2020-06-10 19:59:11 -04:00
a7ffc61e82 acceleratorSIMTlane() 2020-06-10 19:58:33 -04:00
fd97f64612 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-10 12:58:13 -04:00
8720aecb80 Offload more loops 2020-06-10 12:57:55 -04:00
cdf0a04fc5 Merge branch 'develop' into sycl 2020-06-09 04:00:12 -04:00
616d3dd737 CCommpile updates 2020-06-08 18:57:41 -04:00
8b066baca8 Implement transient mechanism 2020-06-08 18:28:53 -04:00
e97f3688db Fix the HMC issue - kernel was launchnig asynchronously 2020-06-08 17:01:15 -04:00
433766ac62 revert Add/SubTimesI and prefetching in stencil
This reverts commit 9b2699226c.
2020-06-08 12:02:53 +02:00
93a37c8f68 test prefetch to L2 in stencil 2020-06-08 09:39:50 +02:00
89a1e78390 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 23:20:37 -04:00
ffbb3fc02c Merge pull request #287 from felixerben/baryon-cleaner
slightly cleaner baryon 2pt code
2020-06-05 22:54:52 -04:00
5a73ef3647 Minor tweak to compile 2020-06-05 21:50:15 -04:00
87e5d2f4b7 Merge branch 'sycl' of https://www.github.com/paboyle/Grid into sycl 2020-06-05 17:32:21 -07:00
d720f10758 Liink error fix 2020-06-05 17:29:20 -07:00
14fcd0912a Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 19:14:17 -04:00
3111c0bd4f Single precisiono hardwire 2020-06-05 19:13:27 -04:00
e03064490e Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 18:53:39 -04:00
1a4c8c3387 Global edit with change to View usage. autoView() creates a wrapper object that closes the view when scope closes. 2020-06-05 18:52:35 -04:00
2b1e259441 Decode of SYCL devices fix 2020-06-04 17:16:55 -07:00
f39c2a240b Priintinig and device memory size detection 2020-06-04 14:58:03 -04:00
0d95805cde Print improvement 2020-06-03 22:50:32 -04:00
f67830587f Accelerator loop use 2020-06-03 22:50:09 -04:00
6bf7f839ff Better printing and logging 2020-06-03 09:28:57 -04:00
e3147881a9 Cache scheme 2020-06-03 09:23:48 -04:00
9872c76825 introduce AddTimesI and SubTimesI; slight benefit in operators, but < 1%; breaks all other impls 2020-06-03 15:20:13 +02:00
fb559614ad Initialise meemory manager 2020-06-03 09:12:47 -04:00
e93e12b6a4 More verbose SYCL setup 2020-06-03 09:12:11 -04:00
0c3112cd94 Use view mechanism 2020-06-03 09:11:51 -04:00
8cfd5d2639 Need lattice view 2020-06-03 09:11:28 -04:00
1c9f20b15e Views must be closed 2020-06-03 09:10:29 -04:00
32237895bd Reorg memory manager for O(1) hash table 2020-06-03 09:09:52 -04:00
5ee3ea2144 round-up after testing of prefetches in stencil close 2020-06-03 11:58:20 +02:00
c5c2dbc0ce Optional CUDA info 2020-06-02 14:21:49 -04:00
9fcb47ee63 Explicit error message instead of infinite loop in GlobalSharedMemory::GetShmDims 2020-06-02 07:44:38 -04:00
5050833b42 revert changes due to performance penalty in Wilson using MPI 2020-06-02 13:08:57 +02:00
7bee4ebb54 correct predication for svcadd 2020-06-02 10:51:39 +02:00
71cf9851e7 correct type for vecd in TimesI and TimesMinusI 2020-06-02 10:44:15 +02:00
b4735c9904 correct zero in svcadd 2020-06-02 10:38:05 +02:00
9b2699226c use fcadd in TimesI and TimesMinusI instead of tbl and neg 2020-06-02 10:32:44 +02:00
5f52804907 update calculation of data 2020-05-30 10:55:17 +02:00
936071773e correct throughput in wilson and dwf 2020-05-29 22:15:59 +02:00
1732f9319e more mods; counters seem to work correctly 2020-05-29 18:44:00 +02:00
91c81cab30 some corrections; compiles on my laptop; untested 2020-05-29 18:19:22 +02:00
38164f8480 include counters in WilsonFermionImplementation.h 2020-05-29 17:59:26 +02:00
f013979791 add counter support in WilsonFermion.h 2020-05-29 17:13:59 +02:00
e947b563ea add space in stencil output 2020-05-29 17:11:17 +02:00
5cb3530c34 enable counters in Benchmark_wilson 2020-05-29 15:44:52 +02:00
250008372f update SVE readme 2020-05-29 15:44:25 +02:00
1d252d0922 Accelerator inline 2020-05-28 11:45:25 -04:00
006cc8a8f1 Staggereed move to accelerator 2020-05-28 08:33:06 -04:00
4fedd8d29f switch to MPI_THREAD_SERIALIZED instead of SINGLE 2020-05-27 14:08:34 +02:00
cf2938688a Sycl unhappy fix 2020-05-25 08:36:53 -07:00
ee63721bad int unhappiness sycl fix 2020-05-25 08:36:24 -07:00
22c5168d70 Sycl happier 2020-05-25 08:35:56 -07:00
949ac3cd24 Must avoid non-trivial copy constructors 2020-05-25 08:35:28 -07:00
7bc0166c1c SYCLL maknig happy - must avoid non ttrivial copy constructors 2020-05-25 08:34:19 -07:00
cb0d1b3399 hopefullly fix buildd fail 2020-05-24 21:27:00 -04:00
d1f1ccc705 HIP changes 2020-05-24 21:18:49 -04:00
c7519a237a Assertions fail on HIP foor unknown reasons - dedbugging 2020-05-24 14:02:47 -04:00
32be2b13d3 Updates for HiP 2020-05-24 14:00:55 -04:00
92b342a477 Hip reduction too 2020-05-24 13:50:28 -04:00
556da86ac3 HIP fp16 2020-05-24 13:41:58 -04:00
8285e41574 View location / access mode 2020-05-21 16:14:41 -04:00
f999408e92 View locatoin and access mode 2020-05-21 16:14:20 -04:00
a7abda89e2 View location & access mode 2020-05-21 16:13:59 -04:00
7860a50f70 Make view specify where and drive data motion - first cut.
This is a compile tiime option --enable-unified=yes/no
2020-05-21 16:13:16 -04:00
6ddcef1bca fix build error enabling fcmla/mac in vector types for VLA 2020-05-21 21:21:03 +02:00
8c5a5fdfce disable fcmla in vector type building for VLA 2020-05-21 19:41:42 +02:00
046b1cbbc0 enable fcmla in tensor arithmetics; fixed-size works, VLA does not compile 2020-05-21 19:39:07 +02:00
a65ce237c1 clean up; Exch1 VLA sp+dp integrate, tested, working 2020-05-21 09:48:06 +02:00
cd27f1005d clean up; Exch1 sp integrate, tested, working 2020-05-21 08:45:43 +02:00
f8c0a59221 clean up; Exch1 dp integrate, tested, working 2020-05-21 02:48:14 +02:00
832485699f save some cycles in HtoD and DtoH by direct instead of multi-pass conversion 2020-05-20 23:04:35 +02:00
81484a4760 symmetrize Mult and MultAddComplex 2020-05-20 22:36:45 +02:00
9a86059761 symmetrize VLA and fixed size build messages 2020-05-20 20:05:42 +02:00
b780b7b7a0 guard prevents multiple TOFU messages 2020-05-20 19:20:59 +02:00
9e085bd04e guard prevents multiple A64FX build messages 2020-05-20 19:16:30 +02:00
6c6812a5ca GB/s output 2020-05-20 12:26:57 +01:00
8358ee38c4 pull develop 2020-05-19 08:56:18 -04:00
1f154fe652 some cleanup in BaryonUtils 2020-05-19 13:48:56 +01:00
d708c0258d some cleanup in BaryonUtils 2020-05-19 13:48:00 +01:00
a7635fd5ba summit mem 2020-05-18 17:52:26 -04:00
6b6bf537d3 comment out mac in vector types 2020-05-18 20:36:16 +02:00
323a651c71 correct typo 2020-05-18 19:58:27 +02:00
9f212679f1 support fcmla in vector_types, untested 2020-05-18 19:55:18 +02:00
032f7dde1a update SVE readme, asm generator 2020-05-18 19:10:36 +02:00
ebb60330c9 Automatic data motion options beginning 2020-05-17 16:34:25 -04:00
5aa60be17d SerialisableClassName method for serialisable enum, and boolean to test if a serialisable object is an enum 2020-05-15 20:00:34 +01:00
50b1db1e8b implemented correct _m form (using 3 operands instead of 2) 2020-05-15 10:01:05 +02:00
015d8bb38a introduced assertions in Benchmark_wilson, removed data output from Benchmark_dwf 2020-05-15 09:15:50 +02:00
10a34312dc some fixed-size code clean up 2020-05-14 23:20:16 +02:00
db8c0e7584 replaced _x form with _m form when using even/odd predication 2020-05-14 23:17:35 +02:00
32fbdf4fb1 Merge pull request #5 from paboyle/develop
Sync upstream
2020-05-13 09:02:56 +02:00
a9847aa866 Dependence fix 2020-05-12 20:03:37 -04:00
2e652431e5 No compile on summiit fix 2020-05-12 18:56:47 -04:00
8b5b55b682 Make tests all compile ccurrent Grid, mostly MdagM removal of norms fixes but a few minor
issues fiixed too
2020-05-12 17:57:24 -04:00
0e3c49f687 TransposeIndex was broken by Christoph 2020-05-12 17:57:01 -04:00
cb7ee37562 Close expressions in arg to cshift 2020-05-12 17:56:40 -04:00
82f71643a4 Remove the norm in MdagM 2020-05-12 17:55:53 -04:00
d15ccad8a7 switched to vec* in Reduce 2020-05-12 20:41:14 +02:00
0009b5cee8 updated SVE_README 2020-05-12 19:02:33 +02:00
20d1941a45 enabled asm kernels for fixed-size A64FXFIXEDSIZE 2020-05-12 19:01:12 +02:00
d24d8e8398 Use X-direction as more bits meaningful on CUDA.
2^31-1 shoulddd always bee enough for SIMD and thread reduced local volume

e.g. 32*2^31 = 2^36 = (2^9)^4 or 512^4 ias big enough.

Where 32 is gpu_threads * Nsimd = 8*4
2020-05-12 10:35:49 -04:00
162e4bb567 no automatic prefetching for now 2020-05-12 07:01:23 -04:00
07c0c02f8c Speed up Cshift 2020-05-11 17:02:01 -04:00
8c31c065b5 Keep the Vector fixed to protect it from realloc 2020-05-11 17:00:30 -04:00
b7c76ede29 Removed some assertions in Test_simd and removed exit() in Reduce 2020-05-11 22:43:00 +02:00
05edf803bd corrected typo 2020-05-12 03:59:59 +09:00
b1c86900b2 Merge pull request #4 from paboyle/develop
merge
2020-05-11 20:59:29 +02:00
78b8e40f83 switched to gcc's internal data types 2020-05-11 18:11:23 +02:00
fc2e9850d3 temporarily enable TOFU by default when using A64FX or A64FXFIXEDSIZE 2020-05-11 13:25:02 +02:00
ffaaed679e MPI_THREAD_SINGLE hack for Fugaku, enabled by -DTOFU 2020-05-11 13:21:39 +02:00
bbbee5660d First compiile on HiP 2020-05-10 05:28:09 -04:00
ea08f193e7 Allocator cache spliit into large/small pools 2020-05-10 05:24:26 -04:00
2bb2c68e15 Separate pools for small and large allocations cache 2020-05-09 22:57:21 -04:00
efe5bc6a3c Split allocator cache into two pools of different sizes 2020-05-09 22:27:56 -04:00
b2fd8b993a fixed-size clean up 2020-05-09 22:53:42 +02:00
291ee8c3d0 updated fixed-size implementation; only Exch1 and prefetches missing 2020-05-09 22:18:02 +02:00
e1a5b3ea49 unions for tables eliminate explicit loads, gcc does not complain 2020-05-09 21:21:57 +02:00
55a55660cb reverted changes 2020-05-09 12:48:42 +02:00
384da487bd Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-05-08 18:55:11 -04:00
ee1de82a53 Working ITT benchmark again 2020-05-08 18:54:50 -04:00
2b576fc185 Comment deadd codde remove 2020-05-08 18:54:29 -04:00
52081acfa5 NVCC compile fixes 2020-05-08 13:14:12 -04:00
b01b7f761a Merge pull request #283 from DanielRichtmann/feature/minor-fixes
Some small fixes
2020-05-08 10:52:03 -04:00
c83471bfd0 Fix missing checkerboards for adj und conjugate 2020-05-08 16:44:03 +02:00
ab0c5d77fb Correct NonHermitianSchurOperatorBase 2020-05-08 16:44:02 +02:00
779e3c7442 Const-correctness for retrieval routines of GridStopWatch 2020-05-08 16:43:52 +02:00
0c570824f2 Add missing declaration of GridCmdOptionInt 2020-05-08 16:43:51 +02:00
f8b8e00090 Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc
Aim to reduce the amount of cuda and other code variations floating around all over the place.

Will move GpuInit iinto Accelerator.cc from Init.cc
Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows
2020-05-08 06:23:55 -07:00
0dd1bdfa94 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-05-08 09:21:43 -04:00
1d65e2f62c Slightly faster Chebyshev; ifdef'ed out the fastest until tested numerics
Lifteed from HDCR setup
2020-05-08 09:20:54 -04:00
93920c4811 Remove verbose 2020-05-08 09:19:54 -04:00
6859a3e1d4 Schur operator 2020-05-08 09:19:12 -04:00
21ca182c36 Comments remove 2020-05-08 09:18:24 -04:00
ceb8b374da API change v3 2020-05-08 15:04:44 +02:00
4bc2ad2894 API change v2 2020-05-08 15:00:25 +02:00
798af3e68f retry changing StoD API 2020-05-08 14:34:59 +02:00
b0ef2367f3 testing alternate call to PrecisionChange 2020-05-08 14:22:44 +02:00
71a7350a85 changed 2nd argument in Reduce to native vector type 2020-05-08 12:26:51 +02:00
6f79369955 trying to get rid of macro definition error 2020-05-08 12:19:24 +02:00
f9cb6b979f corrected more typos 2020-05-08 12:11:01 +02:00
ed4d9d17f8 corrected type 2020-05-08 12:09:22 +02:00
fbed02690d some changes in breaking out A64FX: use -DA64FXFIXEDSIZE for fixed size, but also define GEN 2020-05-08 12:05:31 +02:00
39f3ae5b1d corrected more types 2020-05-08 11:07:14 +02:00
e64bec8c8e pulled SVE typedefs out of Optimization 2020-05-08 11:04:21 +02:00
0893b4e552 fixed typos in PrecisionChange 2020-05-08 10:59:07 +02:00
92f0f29670 fixed double overloading vecf in Div, corrected typos 2020-05-08 10:57:23 +02:00
48a340a9d1 GEN seems to defined by default -> some fixes applied 2020-05-08 10:47:49 +02:00
f45621109b placed typedefs in Optimization 2020-05-08 10:41:52 +02:00
32d1a0bbea added even more debug output 2020-05-08 10:39:26 +02:00
267cce66a1 added more debug output 2020-05-08 10:29:28 +02:00
3417147b11 added real fma, corrected typos in tbls; integrated, must supply A64FXGCC with GEN in configure 2020-05-08 10:20:19 +02:00
b338719bc8 first transition to fixed-size done, excl. Exch; next step: integration 2020-05-07 22:33:28 +02:00
2b81cbe2c2 first attempt to introduce tables using fixed-size; still incomplete 2020-05-07 22:01:19 +02:00
acff9d6ed2 transition to fixed size data types almost done; still incomplete 2020-05-07 21:24:07 +02:00
053b4dd495 Merge pull request #282 from felixerben/baryon-reversal
Baryon reversal
2020-05-07 18:09:17 +01:00
a306a49788 first mods for fixed size; still incomplete 2020-05-07 19:07:49 +02:00
42bb5f0721 asserrtion 2020-05-07 18:06:12 +01:00
253bcc3426 back to old version 2020-05-07 18:03:17 +01:00
a887206413 Merge pull request #281 from felixerben/feature/baryonSpeedup
Feature/baryon speedup
2020-05-07 13:41:29 +01:00
591ebb6213 Merge branch 'develop' of github.com:paboyle/Grid into feature/baryonSpeedup 2020-05-07 11:13:21 +01:00
56e2f7d088 deleted test routines. cleaned up fast version. assert Ns=4,Nc=3. 2020-05-07 10:03:45 +01:00
7ef03c5368 updated SVE readme 2020-05-06 16:30:37 +02:00
525418abfb Merge pull request #273 from lehner/feature/gpt
Feature/gpt
2020-05-06 10:10:51 -04:00
5f780806c2 Merge pull request #279 from paboyle/bugfix/nvcc-config
configure fix for nvcc with extra arguments as CXX
2020-05-06 10:07:52 -04:00
3c6ffcb48c Merge branch 'develop' into feature/gpt 2020-05-06 15:03:35 +02:00
87984ece7d add Lattice_basis.h 2020-05-06 08:47:18 -04:00
e9b295f967 Synchronize blocking infrastructure with GPT 2020-05-06 08:42:28 -04:00
224cbf0453 Merge pull request #280 from mmphys/bugfix/ET_go_home
Bugfix/et go home
2020-05-05 17:56:51 -04:00
c1e57d4357 Merge branch 'develop' into bugfix/ET_go_home
* develop:
  SYCL prep - no sycl just make it compile through DPC++
  dpc++ didn't like rdtsc()
  Make compile if HAVE_LIME=0
  Lime optional
2020-05-05 22:35:04 +01:00
28a1fcaaff First compile against SYCL 2020-05-05 11:13:27 -07:00
6b64727161 disable comments 2020-05-05 05:05:36 -04:00
04863f8f38 debug new AcceleratorView 2020-05-04 16:07:03 -04:00
04927d2e40 SYCL prep - no sycl just make it compile through DPC++ 2020-05-04 10:28:29 -07:00
7caed4edd9 dpc++ didn't like rdtsc() 2020-05-04 10:27:05 -07:00
59c51d2c35 Make compile if HAVE_LIME=0 2020-05-04 10:26:20 -07:00
ff53b231c8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-05-04 10:25:10 -07:00
fc19cf905b Lime optional 2020-05-04 10:24:48 -07:00
2a1387e992 rankInnerProduct 2020-05-03 17:27:11 -04:00
9bfa51bffb cleanup comment 2020-05-03 09:12:52 -04:00
38532753f4 interface cleanup 2020-05-03 08:58:32 -04:00
949be9605c fix pragmas 2020-05-02 16:20:03 -04:00
63cf201ee7 Add AdviseInfrequentUse 2020-05-02 11:38:42 -04:00
c8af498a2a BinaryIO fix for alternative little-endian format name (used in 96I ensemble) 2020-05-01 03:45:50 -04:00
ddb192bac7 re-work double precision promotion for summit 2020-04-30 16:09:57 -04:00
7666300a6f Merge branch 'develop' into bugfix/ET_go_home
* develop:
  Basis rotate stack passig to GPU reduction
  Clean up warning
2020-04-30 20:10:32 +01:00
4a4b9e305d Fix: strToVec enters infinite loop and exhausts memory if operator>> fails before the end of string, e.g. if parsing "0_0_0" for momentum instead of "0 0 0". 2020-04-30 19:40:04 +01:00
9b2d2d0fc3 Basis rotate stack passig to GPU reduction 2020-04-30 12:31:07 -04:00
5011753f4f Clean up warning 2020-04-30 10:23:48 -04:00
dbaeefaeef All Eigen::TensorMap objects are fixed (i.e. cannot be dynamically resized) 2020-04-30 15:02:51 +01:00
dee96cbf82 Added workaround in configure to still catch Cuda compiler when nvcc with extra arguments (eg -ccbin) is used as CXX 2020-04-29 10:37:11 -04:00
dd3ebc2ce4 Slow compile on NVCC switch off conserved current 2020-04-29 08:43:12 -04:00
103e7ae2f0 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-04-29 03:05:36 -04:00
29ae5615c0 Seqeuential fix 2020-04-29 03:05:15 -04:00
6240e02619 added assertion to avoid potential infinite loop 2020-04-27 18:50:53 +01:00
f4033ad8cb baryon speedup by a factor 2 2020-04-27 17:46:14 +01:00
5abec5b8a9 SVE_readme update, update Grid_vector_types.h 2020-04-25 13:48:26 +02:00
499edc0636 updated SVE_README.txt; defined ARMCLANGCOMPAT macro 2020-04-25 13:41:24 +02:00
d990e61be3 armclang 20.1 settings in SVE readme 2020-04-25 12:11:43 +02:00
3edb2dc2da removed -static from gcc CXXFLAGS 2020-04-24 13:04:34 +02:00
f1fe444d4f blocked precision promotion infrastructure upgrade 2020-04-24 06:27:20 -04:00
345721220e resolved merge conflict 2020-04-24 10:14:21 +02:00
6db68d6ecb added SVE configure for armclang and gcc 2020-04-24 10:10:47 +02:00
dae820aa96 Merge pull request #277 from mmphys/bugfix/grid-config
Bugfix/grid config
2020-04-23 10:26:54 -04:00
5daf176f4a Updated to expose GRID_CXXLD in addition to CXXLD.
NB: CXXLD required as this is what drives linking behaviour.
2020-04-23 15:25:53 +01:00
e96c86ec14 Make grid-config message more specific for --cxx and --cxxld 2020-04-23 13:10:45 +01:00
09f0963d1f changes in configure.ac ; to be verified 2020-04-23 11:27:03 +02:00
6f44e3c192 reverted changes in configure.ac ; included SVE configure readme 2020-04-23 11:18:50 +02:00
c2c3cad20d Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-04-23 04:35:42 -04:00
edec9ee2e2 Conserved current rewrite done. Zmobius working 2020-04-23 04:34:01 -04:00
ed70cce542 Test for 5D DWF obserevables 2020-04-23 04:29:45 -04:00
4701201b5f grid-config: Expose CXXLD (for GPU build) and update help 2020-04-22 18:42:30 +01:00
5893888f87 removed default no-strict-aliasing for gcc-10.0.1 exclusively 2020-04-22 19:29:55 +02:00
39b448affb Merge remote-tracking branch 'origin/develop' into feature/a64fx-2 2020-04-22 17:34:12 +02:00
e54a8f05a9 Exchange1 with generic version for now, should use svtbl2 in final version 2020-04-20 22:45:27 +02:00
0782b76ed4 Merge pull request #274 from paboyle/feature/zmobius_paramcompute
ZMobius parameter computation
2020-04-20 14:39:29 -04:00
0896f2cead Added missing include guards in bigfloat_double.h 2020-04-20 10:30:38 -04:00
181709bba4 Merge branch 'develop' into feature/zmobius_paramcompute 2020-04-20 09:12:34 -04:00
64b72fc17f testing gcc 10.0.1: build errors in Exchange1 using -DA64FX and in Lattice_base.h building Dslash only 2020-04-19 01:25:40 +02:00
091d5c605e towards more precise blocking 2020-04-17 04:25:28 -04:00
6fdce60492 revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0) 2020-04-16 22:43:32 +02:00
90229cfb0f Merge pull request #270 from milc-qcd/feature/CGinfo
feature/CGinfo
2020-04-16 11:46:08 -04:00
0475c46ecb Merge pull request #256 from djm2131/feature/BiCGSTAB
Import BiCGSTAB solvers and tests
2020-04-16 11:45:15 -04:00
3cca10e617 Merge pull request #276 from nils-asmussen/fix/regression_nt
fix regression in tests/core/Test_qed.cc
2020-04-16 11:42:39 -04:00
327da332bb Merge branch 'develop' of https://github.com/paboyle/Grid into feature/gpt 2020-04-16 11:30:17 -04:00
852db4626a re-introduced HOTFIX cause Grid binaries give wrong results otherwise; checked in good gridverter.py 2020-04-15 18:22:19 +02:00
43dc2814dd fix regression in core/Test_qed.cc 2020-04-15 16:10:15 +01:00
6504a098cc 999 GiB/s Wilson; 694 GiB/s DW (DP) 2020-04-15 15:06:52 +02:00
79a385faca disabled armclang hotfix cause armclang 20.0 performance gets a little 2020-04-15 11:46:55 +02:00
c12a67030a 980 GiB/s Wilson; 680 GiB/s DW (DP) 2020-04-15 10:55:06 +02:00
581392f2f2 now with pf, best results so far using intrinsics+pf 2020-04-12 22:06:14 +02:00
113f277b6a enable dslash asm using -DA64FXASM, additionaly -DDSLASHINTRIN for intrinsics impl 2020-04-11 04:55:01 +02:00
f3a8d039a2 Merge branch 'feature/hdcr' into develop 2020-04-10 22:01:52 -04:00
974586bedc Dslash finally works; cleaned up; uses MOVPRFX in assembly 2020-04-10 22:26:40 +02:00
4e864e56c9 develop pull 2020-04-10 17:19:18 +01:00
014dbfa464 Compile fix with OpDirAll 2020-04-10 11:57:09 -04:00
3b0e07882f Adding another form of polynomial 2020-04-10 11:28:33 -04:00
8e81a811d0 Merge branch 'feature/hdcr' into develop 2020-04-10 11:14:49 -04:00
aa13118127 Missing conjugate already fixed in develop 2020-04-10 11:11:24 -04:00
6cdb09c884 Faster copy region 2020-04-10 11:10:52 -04:00
a65bc64f10 Accelerator peek poke 2020-04-10 11:09:59 -04:00
11dec4883c Don't throw assert 2020-04-10 11:09:11 -04:00
afa458c812 Extra solvers 2020-04-10 11:08:19 -04:00
dc50190b8f Faster GPU basis rotation
May need to later include Regensburg optimised CPU variant
2020-04-10 11:06:04 -04:00
160f78c1e4 changed debug output to variable direct 3 2020-04-10 12:23:07 +02:00
7e4e1bbbc2 changed debug output to variable direct 2 2020-04-10 12:22:04 +02:00
e699b7e9f9 changed debug output to variable direct 2020-04-10 12:18:30 +02:00
a28bc0de90 debug register address test in WilsonHand 2020-04-10 12:07:45 +02:00
14d0fe4d6c added predication in WilsonHand 2020-04-10 12:04:00 +02:00
0ad2e0815c debug output in WilsonHand 2020-04-10 11:56:29 +02:00
1c8ca05e16 Merge branch 'feature/a64fx-2' of https://github.com/nmeyer-ur/Grid into feature/a64fx-2 2020-04-09 23:32:19 +02:00
dc9c8340bb switched to DSLASHINTRIN for A64FX Dslash intrinsics 2020-04-09 23:30:23 +02:00
19eef97503 specialized A64FX Dslash kernels 2020-04-09 23:25:25 +02:00
635246ce50 corrected typo 2020-04-09 21:42:50 +02:00
5cdbb7e71e fixed A64FX Dslash; compiles, but does not specialize -> assertion 2020-04-09 21:23:39 +02:00
8123590a1b changes 2020-04-09 16:45:47 +02:00
86c9c4da8b changes 2020-04-09 16:40:06 +02:00
cd1efee866 changes 2020-04-09 16:35:13 +02:00
bd310932f7 changes 2020-04-09 16:32:31 +02:00
304762e7ac changes 2020-04-09 16:26:01 +02:00
d79ab03a6c changes 2020-04-09 16:19:25 +02:00
d5708e0eb2 more changes 2020-04-09 15:43:34 +02:00
123f6b7a61 more changes 2020-04-09 15:17:19 +02:00
2b6457dd9a added xp/xm recon accum 2020-04-09 15:13:19 +02:00
b367cbd422 defined ADD_RESULT 2020-04-09 15:08:45 +02:00
e252c1aca3 addressing 2020-04-09 15:03:12 +02:00
b140c6a4f9 addressing 2020-04-09 15:01:15 +02:00
326de36467 revised sU addressing scheme 2020-04-09 14:44:25 +02:00
9f224a1647 fixed typo in single 2020-04-09 14:30:21 +02:00
bb46ba9b5f fixed array size in single 2020-04-09 14:28:45 +02:00
dd5a22b36b revised declarations 2020-04-09 14:21:27 +02:00
1ea85b9972 Disabled build message 2020-04-09 13:47:21 +02:00
8fb63f1c25 added A64FX Wilson kernels single precision 2020-04-09 13:41:04 +02:00
77fa586f6c introduced A64FX Wilson kernels 2020-04-09 13:30:06 +02:00
96e8e44fd4 Merge pull request #2 from DanielRichtmann/feature/fused-innerproduct-norm2
Fused innerProduct + norm2 on first argument operation
2020-04-06 13:16:58 +02:00
5fc8a273e7 Fused innerProduct + norm2 on first argument operation 2020-04-06 11:52:29 +02:00
d671a63e78 Update README.md 2020-04-03 19:52:15 +01:00
15238e8d5e reduce acle works, clean up 2020-04-03 20:40:44 +02:00
b27e31957a reduce acle revised 2020-04-03 19:46:15 +02:00
46927771e3 reduce acle still needs overhaul 2020-04-03 19:30:48 +02:00
d8cea77707 define simd width in header 2020-04-03 19:22:25 +02:00
5f8a76d490 clean up, reduction in acle 2020-04-03 19:18:24 +02:00
28d49a3b60 build problem resolved 2020-04-03 16:52:48 +02:00
b4c624ece6 added A64FX support 2020-04-03 15:43:23 +02:00
2c22db841a Added momentum scaling to scalar HMC theories in order to follow UKQCD/CPS conventions 2020-04-02 17:38:47 +01:00
856d168e41 global sum over vectors of uint64_t 2020-03-29 07:56:05 -04:00
6235c7ba98 IPP path fix in configure 2020-03-27 17:23:29 +00:00
7e13724882 removing Hadrons 2020-03-27 12:03:32 +00:00
b6cbdd2aa3 Merge pull request #1 from DanielRichtmann/feature/read-openqcd
Feature/read openqcd
2020-03-26 17:39:04 +01:00
a2188ea875 remove debugging printf from WilsonKernelsImplementation 2020-03-26 09:12:36 -04:00
989af65807 Check in parallel reader for openqcd configs 2020-03-24 11:20:54 +01:00
60db3133d3 make trace,adj,transpose unary operators 2020-03-16 17:59:56 -04:00
c9b737a4e7 make trace,adj,transpose unary operators 2020-03-16 17:58:30 -04:00
037bb6ea73 Check in reader for openqcd configs
This reader is suboptimal in the sense that it opens the entire config on every MPI rank.
2020-03-16 14:28:02 +01:00
05ebc458e2 Merge pull request #260 from mmphys/feature/distil
Distillation: save eigenvalues of the Laplacian for all timeslices
2020-03-13 14:00:21 +00:00
3753508957 Making change 1) as simple as possible 2) as much like MSink/Point.hpp as possible 2020-03-12 13:47:51 +00:00
c1677fccf6 Merge branch 'develop' into feature/distil
* develop:
  bugfix ZPerambulator
  registered module supporting ZMobius action
  changed to push_back according to request
  Added Hadrons_Error in case blockSize is set too large
  bugfix in perambulator module

# Conflicts:
#	Hadrons/Modules/MDistil/Perambulator.hpp
2020-03-12 12:45:18 +00:00
35e8e31749 Merge pull request #272 from mmphys/feature/ZPeramb
bugfix ZPerambulator
2020-03-12 12:28:04 +00:00
34813e9b04 Merge branch 'develop' into feature/ZPeramb 2020-03-12 12:27:56 +00:00
373cf61abb bugfix ZPerambulator 2020-03-12 11:44:43 +00:00
4e8fbc4b49 Merge pull request #271 from mmphys/feature/ZDistil
registered module supporting ZMobius action
2020-03-12 10:54:07 +00:00
516ac1d4d5 registered module supporting ZMobius action 2020-03-12 10:52:27 +00:00
318f63eb34 Merge pull request #268 from mmphys/a2a-error-log
Added Hadrons_Error in case blockSize is set too large
2020-03-11 11:09:00 +00:00
16503d7532 Merge pull request #267 from mmphys/feature/distil-bugfix
bugfix in perambulator module
2020-03-11 11:08:23 +00:00
0fa93383b7 changed to push_back according to request 2020-03-11 09:05:01 +00:00
0a827aa7bf Added Hadrons_Error in case blockSize is set too large 2020-03-11 08:52:52 +00:00
165c68e28e Change TrueResiduals to TrueResidualShift and IterationsToComplete to IterationsToCompleteShift 2020-02-29 17:51:51 -06:00
b32b1ca642 bugfix in perambulator module 2020-02-26 12:06:45 +00:00
9479bc8486 Make IterationsToComplete and TrueResidual externally accessible 2020-02-19 17:43:57 -06:00
8a5c13d5fb Still fast moving in changes 2020-02-06 17:57:26 -05:00
bdccb0c91f Working 2 types of decomposition 2020-02-06 17:26:55 -05:00
68b45f6444 Lower left/upper right region cut paste 2020-02-06 15:50:26 -05:00
ef9b3e658a extra typedef 2020-02-06 15:47:14 -05:00
b9ca40cc44 More precise power method at start 2020-02-06 10:09:14 -05:00
2f421a5db1 Commeent fix 2020-02-06 10:08:27 -05:00
10192dfc71 Wall source momenta must be specified for spatial components only.
So we don't break existing scripts, allow momentum in time direction as well, but only if zero.
Fail early, so do the check in setup()
2020-01-31 15:02:03 +00:00
c69a3b6ef6 When saving eigenvectors, LapEvec now saves eigenvalues for every timeslice as well.
I.e. nT x nVec eigenvalues are saved in FileName.evals.conf.h5.
A new named tensor, "TimesliceEvals" can be used to simplify restoring these from disk.
NB: The changes in BaseIO add support so that Eigen tensors can be easily used in MPI operations, e.g. GlobalSum.
See LapEvec.hpp for an example of how this is done.
2020-01-29 21:20:20 +00:00
852fc1b001 True Hierachical multigrid for DWF 2020-01-27 13:45:10 -05:00
2b5de5bba5 MdagM operator without norm option 2020-01-27 13:44:30 -05:00
2e85cae74e Add Jacobi polynomials 2020-01-27 13:43:49 -05:00
76c823781e Much faster coarsening 2020-01-27 13:43:19 -05:00
114db3b99d Optional MdagM without norms 2020-01-27 13:42:51 -05:00
49e123dbda Use explicit linalg calls to get coalesce optimisations on GPU 2020-01-27 12:44:51 -05:00
8cec294ec9 Make CG a bit less verbose as gettign annoying in nested algorithms.
Can use Iterative logging if you want to see more
2020-01-27 12:44:04 -05:00
eb5b720e94 Normal Equations can be used in HDCR now 2020-01-27 12:43:29 -05:00
b2736ec80b Make PrecGCR recursive - it can precondition itself 2020-01-27 12:42:48 -05:00
086256a032 Less sloppy convergence test on PowerMethod 2020-01-27 12:41:59 -05:00
afc7426f39 Much bigger pointer cache in case of Nvidia due to cost of setting up UVM allocations 2020-01-27 12:41:16 -05:00
7c061e20c9 All directions of dirac operator for fastt coarsening 2020-01-27 12:40:13 -05:00
e5d1c09665 Faster DhopDirAll for little dirac operator coarsening 2020-01-27 12:38:54 -05:00
8016a465ae Remove extraneous variable 2020-01-27 12:35:37 -05:00
d8b9742092 DhopDirAll for faster matrix elements of little Dirac operator 2020-01-27 12:34:54 -05:00
1bd87c35d7 Read coalescing on Nvidia 2020-01-27 12:29:56 -05:00
fa856c9669 Disable information message 2020-01-27 12:28:46 -05:00
48008e4d8b Thread coordinate creation loop 2020-01-27 12:28:16 -05:00
55cdb17691 Integer divide for blocking 2020-01-27 12:27:45 -05:00
2ed39ebb7a Perambulator won't even allocate memory for unsmeared sinks unless the filename is specified.
Prior to this update, memory is allocated regardless of whether these are requested.
2020-01-24 13:01:06 +00:00
96671bbb24 Added ability to pass callback to MADWF that is called every inner iteration and allows user to, for example, adjust the inner solver tolerance depending on residual
Added a general implementation of the Remez algorithm for producing arbitrary rational polynomial approximation with optional restriction to even/odd polynomials
Added implementation of computation of ZMobius parameters
Added Test_zMADWF_prec to test ZMobius in MADWF
2020-01-17 12:45:30 -08:00
554542b773 Merge branch 'feature/hdcr' of https://github.com/paboyle/Grid into feature/hdcr 2020-01-06 11:47:56 -05:00
03da4040e2 Make summit happy 2020-01-06 11:47:48 -05:00
e583035614 Change to interface to minise comms in evaluating coarse space operator 2020-01-06 11:43:59 -05:00
3c3d6a94f3 OPtimising the force term a bit 2020-01-04 03:16:23 -05:00
205ea4bbb2 More verboose Lanczos 2020-01-04 03:13:40 -05:00
039eb7b2eb Make the force term and coarsening multigrid more optimised 2020-01-04 03:12:17 -05:00
f7e4bd1f6d Getting more optimised 2020-01-04 03:11:53 -05:00
0afecfcae7 Nearing well optimised state 2020-01-04 03:11:19 -05:00
ba40a3f763 Alternate low pass filter option 2020-01-03 05:29:09 -05:00
aa920aa532 Improved DWF multigrid 2019-12-28 10:32:35 -05:00
c0d8e4dce5 Improved Multigrid for DWF 2019-12-28 10:32:15 -05:00
0ca1992151 Remove warning in tensor layout comparison. Make default names and index names visible for PerambTensor and NoiseTensor 2019-12-20 13:53:27 +00:00
df2b0c4e79 Merge branch 'develop' into feature/distil
* develop:
  Missing conjugate in MooeeInvDag
  Allow subspace setup to no converge
  fp16 mandatory. Use SFW is not available as hdw
2019-12-20 13:24:59 +00:00
9cfd64c604 Coarse grid on GPU, not fast enough yet. Need a 10x 2019-12-17 05:24:45 -05:00
e478404291 Tuned up significantly on GPU, but another 10x in coarse space required 2019-12-17 05:03:25 -05:00
9aafd20468 Simple block project promote runs faster on GPU 2019-12-17 05:01:39 -05:00
5d834486c9 Merge pull request #259 from grid-test-organisation/feature/5d-improvement-fix
Missing conjugate in MooeeInvDag
2019-12-16 04:20:37 -05:00
f7373e97a4 Missing conjugate in MooeeInvDag 2019-12-16 10:05:50 +01:00
9e15474999 Accelerator loop attempt at speed up 2019-12-14 05:28:16 -05:00
152b525a4d Typo fix 2019-12-13 22:44:42 -05:00
d18994eddc offload more of mgrid to GPU 2019-12-13 22:08:11 -05:00
b8bd8cd2ae Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-13 21:32:10 -05:00
736b19485e Faster set up and some dead code ifdef'ed out 2019-12-13 21:30:48 -05:00
c7637a84ad Documentation tweak for peculiarities of OpenMPI --prefix 2019-12-12 17:00:03 +00:00
a7772c827b Documentation tweak 2019-12-12 16:05:22 +00:00
8e83398861 Merge pull request #257 from AndrewYongZhenNing/develop
Added NamedTensor.hpp
2019-12-11 21:36:59 +00:00
843ca9350a Fix naming conventions to be consistent with Peter 2019-12-11 11:46:18 -05:00
f47b2b6e13 Added NamedTensor.hpp 2019-12-11 15:56:46 +00:00
5bfd1470ad Merge branch 'develop' into feature/hdcr 2019-12-10 21:51:06 -05:00
6957b0b58a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-10 21:50:42 -05:00
d73f0b8618 Verbose for temporary debug 2019-12-10 21:50:06 -05:00
0b3a3562c3 Some MPI (summit) create sigusr2, so trap that 2019-12-10 21:49:12 -05:00
710fee5d26 Subspace setup testing code
and timing verbose
2019-12-10 21:48:42 -05:00
bab0bf2e93 Merge branch 'develop' into feature/hdcr 2019-12-10 21:47:41 -05:00
848079e8ba Merge pull request #235 from grid-test-organisation/feature/5d-improvement
MooeeInv and M5D optimisations + enable threading with nvcc
2019-12-10 21:45:03 -05:00
f2a4f13111 Must offload the Coarsened matrix if Stencil buffers are device resident 2019-12-10 19:32:12 -05:00
4180a4a8a7 Import BiCGSTAB solvers and tests 2019-12-10 17:20:35 -05:00
b9b9fcbfa0 Merge pull request #229 from nils-asmussen/feature/JacobiSmear
MSource::jacobi smear + sort file contents of Modules.hpp and modules.inc
2019-12-09 22:50:02 +00:00
bbe48998a8 sort Modules.hpp and modules.inc + add module JacobiSmear 2019-12-09 18:06:29 +00:00
6446671a9c Merge pull request #241 from nils-asmussen/fix/remQCDns_ignore_ws
Undo whitespace changes in fix/removeQCDremnants to allow comparing relevant changes
2019-12-09 18:02:21 +00:00
110373ea79 Merge pull request #204 from nils-asmussen/sha256sum_Eigen_download
bootstrap.sh: verify checksum of Eigen tar file
2019-12-09 18:01:46 +00:00
a986786192 bootstrap.sh: verify checksum of Eigen tar file if sha256sum is installed 2019-12-09 17:11:21 +00:00
edd1c924eb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-09 03:53:01 -05:00
9b6b0caa55 Junk commit fix 2019-12-09 03:01:58 -05:00
2a48617ac5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-09 03:00:00 -05:00
876d9c957c QMR 2019-12-09 02:59:49 -05:00
295e535f93 QMR 2019-12-09 02:59:35 -05:00
58a31f0763 QMR implemented, preserve even if not used much 2019-12-09 02:59:13 -05:00
3d2fe80780 Temporary size depends on checkerboard/uncheckerboard. The Mdir cares 2019-12-09 02:58:24 -05:00
e43fce1083 Clean up and simplify a little. 2019-12-09 02:55:45 -05:00
0dfdf80407 Logging 2019-12-09 02:54:52 -05:00
2912071f83 Add non hermitian operator 2019-12-09 02:51:53 -05:00
26605ef387 HDCR back to working 2019-12-09 02:51:01 -05:00
1e5ac576d9 Merge commit 'f7698b93ca57ea3aa4d72b133ad9ca5d1e703661' into develop
# Conflicts:
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-12-06 11:59:21 +00:00
d5492b426f Hadrons: better order in module list 2019-12-06 11:56:26 +00:00
d428858c9d Merge pull request #255 from fionnoh/feature/sparseNoise
Feature/sparse noise
2019-12-06 11:43:27 +00:00
f7698b93ca corrected comments about quark line directions 2019-12-06 09:46:52 +00:00
7ce77690b8 Naming conventon also applied to metadata 2019-12-05 17:38:43 +00:00
164ed9c434 Naming conventon also applied to metadata 2019-12-05 17:38:00 +00:00
a54157e682 more definitions changed 2019-12-05 17:08:09 +00:00
58b6a0d8d1 changed some naming conditions to resemble rare-kaons 2019-12-05 16:56:54 +00:00
1a5e562bde only one FIMPL left! 2019-12-05 16:46:58 +00:00
45be26cf3f Merge branch 'develop' of https://github.com/fionnoh/Grid into feature/sparseNoise 2019-12-05 16:18:47 +00:00
5227ffccb7 Added James' sparse noise code and a module to use it 2019-12-05 15:50:03 +00:00
a0b47cc0be Merge pull request #254 from fionnoh/bugfix/eigenMigration
Updated Eigen URL after migration to gitlab
2019-12-05 15:26:38 +00:00
b766038810 new syntax after merge 2019-12-04 18:08:00 +00:00
cd9fd80a5d merged in develop 2019-12-04 17:12:46 +00:00
d6100cc35a Merge pull request #253 from mmphys/feature/distil
Fix phase convention adjustment error
2019-12-04 14:58:51 +00:00
29a1530510 Updated Eigen URL after migration to gitlab 2019-12-04 13:49:22 +00:00
15119eaf03 Fix phase convention adjustment error (and make no assumptions about node layout) 2019-12-04 09:59:58 +00:00
188e12ffbb Merge pull request #249 from mmphys/feature/distil
Feature/distil
2019-12-03 18:06:00 +00:00
e940f4db7e removed unused parameter parity 2019-12-03 12:01:31 +00:00
9c7f269489 typo in fimpl4 2019-12-03 11:19:54 +00:00
07feaf9531 updated ascii-doc preamble 2019-12-03 11:17:35 +00:00
7983ff2fdd Merge branch 'develop' into feature/distil
* develop:
  Change to reporting
  NVCC timer support
  Fix nocompilee under NVCC
  --enable-summit flag
  IBM summit optimisation. Synchronise in node is still btweeen 2 halves of AC922, so could be a little faster
  Sliced propagator contraction was not producing any results because buf.size()=0
  several typos in hadrons
2019-11-30 16:47:03 +00:00
2db814f2b7 Resolve conflicts in BaryonUtils (just use latest from develop) 2019-11-29 18:19:35 +00:00
6418f06771 Add option to save the eigenvectors of the Laplacian.
If they are saved, then metadata saved are:
solverXml	Parameters for this LapEvec module instance
OperatorXml	module type and parameters (if any) for the module that created the gauge field
2019-11-29 18:06:18 +00:00
8a5576f73c cleared up how exactly q_spec has to be defined 2019-11-28 12:35:18 +00:00
997790ad24 Allow subspace setup to no converge 2019-11-26 14:04:28 -05:00
900d6fad21 fp16 mandatory. Use SFW is not available as hdw 2019-11-26 13:26:43 -05:00
799ff0c96e speed-up 2019-11-26 15:28:47 +00:00
5fd5c25114 now two seperate functions for Eye and NonEye 2019-11-26 13:44:55 +00:00
62b3799c77 Merge pull request #251 from fionnoh/bugfix/WallWallMeson
MContraction::Meson bugfix
2019-11-26 12:46:03 +00:00
d1a89af8c9 Change to reporting 2019-11-22 10:49:10 -05:00
d91ba1f6cc NVCC timer support 2019-11-21 20:11:19 +00:00
f4d27e7090 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-11-21 20:09:31 +00:00
feb1ff3494 Fix nocompilee under NVCC 2019-11-21 20:03:39 +00:00
8ef6175acc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-11-21 15:02:21 -05:00
e4399e3ee1 --enable-summit flag 2019-11-21 15:02:10 -05:00
98ea67b636 IBM summit optimisation. Synchronise in node is still btweeen 2 halves of AC922, so could
be a little faster
2019-11-21 15:00:46 -05:00
421a4395af Sigma to Nucleon contractions 2019-11-21 17:25:37 +00:00
cf95a460a5 Sliced propagator contraction was not producing any results because buf.size()=0 2019-11-21 17:17:55 +00:00
a60e20f265 Merge pull request #250 from mmphys/hadrons-typos
several typos in hadrons
2019-11-20 17:10:08 +00:00
9261c0da89 several typos in hadrons 2019-11-20 17:06:32 +00:00
b350a24ded fixed test_distil 2019-11-18 15:29:20 +00:00
13a0db7162 Reverse changes not intended to be part of distillation release 2019-11-18 12:34:49 +00:00
18177d9709 Review changes 2019-11-18 11:59:13 +00:00
7bf42b9c0e HADRONS_ERROR 2019-11-18 10:27:35 +00:00
2d6f4e0c09 fixed issue with HADRONS_ERROR, no idea why this works 2019-11-15 13:46:47 +00:00
7f06c40107 _var -> var_ 2019-11-15 13:26:24 +00:00
9f75065205 eigen_strong_inline gone 2019-11-15 13:22:20 +00:00
271a02230e assert -> ERROR 2019-11-15 11:11:50 +00:00
b1e8b5b5ce changed default behaviour as discussed with antonin 2019-11-15 11:00:25 +00:00
25d2521d77 small stuff 2019-11-13 16:34:09 +00:00
500ef17143 beauty 2019-11-13 15:14:51 +00:00
ee9dd22643 worked on test_distil 2019-11-13 14:59:44 +00:00
a977d9901b cleanup 2019-11-13 14:52:06 +00:00
667ffb70db changed error type 2019-11-13 12:16:56 +00:00
65b3059bd7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-11-13 11:51:14 +00:00
5238808ccd No DistilVectors specified in xml no throws an error 2019-11-13 11:50:55 +00:00
8f88fee680 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  made notation DPar->dp consistent over modules
2019-11-13 11:34:10 +00:00
fcc412a1c2 Remove conditional compilation to support GPU build 2019-11-13 11:32:23 +00:00
12e415330f made notation DPar->dp consistent over modules 2019-11-13 11:21:08 +00:00
66e0811317 Attempt to fix cuda build 2019-11-13 00:02:51 +00:00
55e743aad6 Streamline 2019-11-12 23:57:28 +00:00
e2ab0d671e Implement destructors 2019-11-12 23:18:37 +00:00
7a4c5dbbd5 Restoring previous version for _reduced variables 2019-11-12 22:12:35 +00:00
3f00b8f6c7 Switch to std::unique_ptr<GridCartesian> grid3d;
Remove hand-coded reference to pi - switch to <math.h> definition
2019-11-12 21:53:09 +00:00
6d7043e0c2 NamedTensor changes done 2019-11-12 17:31:42 +00:00
b0f24ec302 Test works now 2019-11-12 15:14:13 +00:00
fb2834bf82 Oops 2019-11-12 14:01:20 +00:00
78f75b0e9f Better than graffiti 2019-11-12 14:00:46 +00:00
62dd0bfe58 New parameter module compiles. Untested. 2019-11-12 13:59:53 +00:00
db952993fa envCreate problem.. 2019-11-12 12:23:34 +00:00
b8f0878981 removed most default behaviour 2019-11-11 17:49:38 +00:00
df586a142d added DistilPar-module and cleaned up some code 2019-11-11 17:29:55 +00:00
7a446d5b7f removed default filenames 2019-11-11 14:36:45 +00:00
e7d7ea4f8f added LoadNoise module 2019-11-11 12:55:45 +00:00
f8e1941327 Implemented specialisations of NamedTensor as derived classes, however this suffers a number of problems:
1) virtual functions not available in base class constructor where I'd like to use them - e.g. IndexNames
2) Must define new constructors in derived classes
... so the specialisations are fatter than I'd like. Would prefer to revert to specifying tensor name and index name defaults in template
2019-11-08 11:55:00 +00:00
65aa54804e added comments 2019-11-08 11:15:51 +00:00
293bfe17d1 added code to the noise module... 2019-11-07 14:00:40 +00:00
a8f3a111a5 added Serial RNG - code compiles but not tested! 2019-11-07 13:45:38 +00:00
5c23abe507 commented on Notation 2019-11-07 11:57:40 +00:00
22c654182a Fixes for GPU compile 2019-11-04 17:24:34 +00:00
6f0439c0e4 Remove unnecessary cast 2019-11-04 15:50:14 +00:00
4f9a7c5d76 Back out unnecessary change 2019-11-02 16:50:29 +00:00
fcd90705bc Beautification 2019-11-02 16:15:48 +00:00
4bcdb4ff95 Remove accidental check-in of local debugging 2019-11-02 15:24:12 +00:00
1c10933db1 Rationalisation of NamedTensor (Perambulator) 2019-11-02 14:58:32 +00:00
52d8d576d0 Removed SliceShare as a reusable routine 2019-11-01 20:10:51 +00:00
ada0a7a83b C++11 case comparison of named tensor index names 2019-11-01 16:05:08 +00:00
efe2f2d48b Merge branch 'develop' into feature/distil
* develop:
  Summit jsrun GPU mapping updates. Conffigure with --enable-jsrun
  Fixed Lanczos calling aligned alloc in threaded region hitting up against pointer-cache no-threading restrictions Fixed Lattice::reset not compiling with new Grid explicit memory region handling Fixed memory leak in Lattice::resize that occurs when data region has been previously allocated
2019-11-01 15:38:48 +00:00
45d4cf0971 Cleanup in progress 2019-11-01 15:35:07 +00:00
ac614cbc53 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-10-31 11:46:43 -04:00
ec8e060ec7 Summit jsrun GPU mapping updates. Conffigure with --enable-jsrun 2019-10-31 11:46:09 -04:00
5c54f27ac1 some cleanup, but hard-coded src in LapEvec unclear 2019-10-31 11:51:05 +00:00
4ed9379535 some cleanup 2019-10-31 11:45:50 +00:00
858e348a6d Cleanup of messages 2019-10-31 11:11:52 +00:00
3b3680c64e Reversed Felix's interim A2Autils.h changes ... these were finished and went into develop via a separate branch 2019-10-30 15:50:04 +00:00
2a926b3dc6 Merged latest changes from develop, in preparation for release. 2019-10-30 14:52:34 +00:00
845a045493 Merge pull request #233 from giltirn/lanczos_fix
A few run /compile / memory leak fixes
2019-10-30 10:21:59 -04:00
eb8848a071 Merge branch 'develop' into feature/distil
* develop: (27 commits)
  Update README.md
  result layout standardised, iterator size more elegant
  updated syntac in Test_hadrons_spectrum
  chroma-regression test now prints difference correctly
  baryon input strings are now pairs of pairs of gammas - still ugly!!
  second update to pull request
  Changing back interface for Gamma3pt
  Removing old debug code
  Changes to A2Autils
  suggested changes for 1st pull request implemented
  changed input parameters for easier use
  Should compile everywhere now
  changed baryon interface
  added author information
  ready for pull request
  code compiling now - still need to test
  Baryons module works in 1 of 3 cases - still need SlicedProp and Msource part!!
  thread_for caused the problems - slow for loop for now
  still bugfix
  weird bug...
  ...

# Conflicts:
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-10-30 14:13:00 +00:00
f31e3278a6 Update README.md 2019-10-25 11:43:55 -04:00
ca234325bc Fix single-precision error 2019-10-23 21:49:32 +01:00
c97f780784 Merge pull request #243 from fionnoh/feature/A2A_current_insertion
Feature/a2 a current insertion
2019-10-22 13:55:53 +01:00
78bdb0ff6a Grid 2019-10-20 14:22:45 +01:00
decab587a0 PerambFileName defaults to object name if empty 2019-10-20 14:14:06 +01:00
202f025fc7 Merge pull request #242 from mmphys/feature/baryons
Feature/baryons
2019-10-16 15:06:32 +01:00
3c702b510b result layout standardised, iterator size more elegant 2019-10-15 18:48:51 +01:00
519ce19128 Fixes to enable GPU build. NB: Contractor and ContractorBenchmark still not working 2019-10-14 22:40:13 +01:00
8d166a81c0 updated syntac in Test_hadrons_spectrum 2019-10-14 13:41:08 +01:00
aa62ca9046 chroma-regression test now prints difference correctly 2019-10-10 11:07:20 +01:00
2dee4791db baryon input strings are now pairs of pairs of gammas - still ugly!! 2019-10-09 17:56:09 +01:00
548b3bf43c second update to pull request 2019-10-09 14:52:33 +01:00
a55d0ba8fe Changing back interface for Gamma3pt 2019-10-08 15:52:01 +01:00
5de9547db5 Removing old debug code 2019-10-08 15:51:28 +01:00
6a3b09cf02 Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion 2019-10-08 13:25:51 +01:00
10de4bfc23 Changes to A2Autils 2019-10-08 13:24:56 +01:00
2ce7f2b4d8 suggested changes for 1st pull request implemented 2019-10-08 13:19:47 +01:00
88d6ff8f1d Peter's bugfix in ImplicitlyRestartedLanczos.h
My bugfix in MomentumPhase.hpp
2019-10-07 17:36:11 +01:00
803329af99 Merge branch 'develop' into feature/distil
* develop:
  Fix after GPU merge: Phase in Free Propagator
  z2-momentum phase module

# Conflicts:
#	Hadrons/Modules/MSource/MomentumPhase.hpp
2019-10-07 13:09:52 +01:00
9d96899aa8 Doc bugfix 2019-10-07 13:05:04 +01:00
86939dbf1a Removed unnecessary function (for getting a parameter) 2019-10-04 13:59:59 +01:00
317645aaeb undo (most) whitespace changes in the two files HMC/Mobius2p1fEOFA{,_F1}.cc 2019-10-02 16:25:23 +01:00
e280ec6b0b changed input parameters for easier use 2019-10-02 16:14:06 +01:00
d5a180d914 Merge branch 'fix/removeQCDremnants' into fix/remQCDns_ignore_ws 2019-10-02 16:11:27 +01:00
d2928761dd Merge pull request #240 from guelpers/feature/bugfixafterGPUmerge
Fix after GPU merge: Phase in Free Propagator
2019-10-02 15:00:15 +01:00
f2a74c603f Merge pull request #239 from mmphys/z2_momentum
z2-momentum phase module
2019-10-02 14:57:59 +01:00
5f22810f55 Fix after GPU merge: Phase in Free Propagator 2019-10-02 14:49:35 +01:00
92e25488f8 Added MomentumPhase Hadrons module from z2_momentum branch (thankyou, Felix) so I can run Z_2 wall with momenta easily 2019-10-02 14:13:35 +01:00
89ef2b7dc2 Should compile everywhere now 2019-10-02 13:20:07 +01:00
7606554b76 Remove references to unused modules (now part of separate Baryons branch) 2019-10-02 13:16:58 +01:00
c8fc0b3e0c changed baryon interface 2019-10-02 11:36:39 +01:00
ccb5e8374b z2-momentum phase module 2019-09-30 17:36:15 +01:00
b88fd436e7 added author information 2019-09-30 17:07:46 +01:00
155bcd4ff3 ready for pull request 2019-09-30 16:58:20 +01:00
d1daab601a Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion
Peter's GPU branch changes merged with A2A CI code
2019-09-30 16:53:44 +01:00
e5d7910fa7 code compiling now - still need to test 2019-09-30 13:55:26 +01:00
94b9a9474c Baryons module works in 1 of 3 cases - still need SlicedProp and Msource part!! 2019-09-27 15:08:56 +01:00
bf62ec163d thread_for caused the problems - slow for loop for now 2019-09-26 13:33:49 +01:00
8415e23fc6 still bugfix 2019-09-26 11:09:09 +01:00
76c93aa44e weird bug... 2019-09-17 14:36:26 +01:00
3137628222 BaryonUtils.h is now part of Baryons 2019-09-17 13:19:20 +01:00
ce965ee6bb Cleanup tests that are no longer required 2019-09-17 13:10:59 +01:00
911fbb0f36 Cleanup modules that are no longer required 2019-09-17 13:06:52 +01:00
eb293e9909 Restore Baryons modules per develop branch 2019-09-16 20:29:37 +01:00
f548114ff6 bugfix 2019-09-16 17:55:58 +01:00
dab8c01c3d added Baryon code 2019-09-16 17:20:54 +01:00
2f3dd0703d Ensure Distillation test (Test_distil) works 2019-09-16 17:00:46 +01:00
2e963d1a78 Fix location of Grid.h and remove reference to QCD namespace 2019-09-16 15:34:47 +01:00
bf52e7cc96 Latest BaryonUtils.h from Felix + my fixes 2019-09-13 18:11:10 +01:00
61d017d0a5 Merge GPU support (upstream/develop) into distillation branch.
This compiles and looks right ... but may need some testing

* develop: (762 commits)
  Tensor ambiguous fix
  Fix for GCC preprocessor/pragma handling bug
  Trips up NVCC for reasons I dont understand on summit
  Fix GCC complaint
  Zero() change
  Force a couple of things to compile on NVCC
  Remove debug code
  nvcc error suppress
  Merge develop
  Reduction finished and hopefully fixes CI regression fail on single precisoin and force
  Double precision variants for summation accuracy
  Update todo list
  Freeze the seed
  Fix compiling of MSource::Gauss for single precision
  Think the reduction is now sorted and cleaned up
  Fix force term
  Printing improvement
  GPU reduction fix and also exit backtrace option
  GPU friendly
  Simplify the comms benchmark
  ...

# Conflicts:
#	Grid/communicator/SharedMemoryMPI.cc
#	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
#	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
#	Grid/qcd/smearing/StoutSmearing.h
#	Hadrons/Modules.hpp
#	Hadrons/Utilities/Contractor.cc
#	Hadrons/modules.inc
#	tests/forces/Test_dwf_force_eofa.cc
#	tests/forces/Test_dwf_gpforce_eofa.cc
2019-09-13 13:30:00 +01:00
04a661cafe Remove unused modules BC2 and Baryon2 2019-09-10 14:49:24 +01:00
a7fa86dc29 MooeeInv improvement for DW EOFA + comments 2019-09-05 12:05:21 +01:00
0c1efa5235 pass OpenMP flag to host compiler 2019-09-03 12:12:25 +01:00
fdd9b14e82 speed up MooeeInvDag for DWF EOFA 2019-09-02 14:49:51 +01:00
e66669d300 fast MooeeInv for EOFA 2019-09-02 14:26:13 +01:00
0efaf3c4fa access M5D coeffs through pointers 2019-09-02 11:33:00 +01:00
3ef519aaa4 fast MooeeInv 2019-09-02 11:18:14 +01:00
b473405652 Tensor ambiguous fix 2019-08-29 09:36:41 -05:00
114ebb7914 Fixed Lanczos calling aligned alloc in threaded region hitting up against pointer-cache no-threading restrictions
Fixed Lattice::reset not compiling with new Grid explicit memory region handling
Fixed memory leak in Lattice::resize that occurs when data region has been previously allocated
2019-08-26 16:47:44 -04:00
9b7a6d197f Fix for GCC preprocessor/pragma handling bug 2019-08-23 14:37:46 +01:00
59cd7f3b70 Trips up NVCC for reasons I dont understand on summit 2019-08-23 06:03:49 -04:00
28d6be2a4e Fix GCC complaint 2019-08-22 18:56:37 +01:00
6b6c5aa626 remove namespace QCD from directory tests 2019-08-20 15:35:36 +01:00
9210b0aa6e remove namespace QCD from directory HMC 2019-08-20 15:21:23 +01:00
ad01290545 remove remnants of the namespace QCD 2019-08-19 20:30:33 +01:00
25150eb2e0 3pt contraction now takes a list of gammas 2019-08-15 12:09:30 +01:00
95f66cc93c Merge branch 'feature/gpu-port' into develop 2019-08-15 02:19:31 +01:00
12eb2a6a34 Zero() change 2019-08-15 01:43:00 +01:00
7c8902b04f Merge branch 'develop' into feature/gpu-port 2019-08-15 01:33:07 +01:00
4278caa030 Force a couple of things to compile on NVCC 2019-08-15 01:32:03 +01:00
be37dfb6f8 Remove debug code 2019-08-15 01:31:40 +01:00
5e8437029f nvcc error suppress 2019-08-15 01:31:12 +01:00
e279b2be29 Merge develop 2019-08-14 23:01:59 +01:00
48e6efc7c9 Merge branch 'develop' into feature/gpu-port
Conflicts:
	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
	Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
	benchmarks/Benchmark_comms.cc
2019-08-14 18:56:54 +01:00
55c095f620 Merge pull request #226 from nils-asmussen/fix/Gauss
Fix compiling of MSource::Gauss for single precision
2019-08-14 17:50:38 +01:00
3e49dc8a67 Reduction finished and hopefully fixes CI regression fail on single precisoin and force 2019-08-14 15:18:34 +01:00
96ac56cace Double precision variants for summation accuracy 2019-08-14 13:08:01 +01:00
2b037e3daa Update todo list 2019-08-14 13:07:26 +01:00
2d2de7aede Freeze the seed 2019-08-14 13:07:11 +01:00
e3966aa49b Fix compiling of MSource::Gauss for single precision 2019-08-12 14:57:11 +01:00
ce97638bac Think the reduction is now sorted and cleaned up 2019-08-11 11:09:01 +01:00
53e3ab4131 Fix force term 2019-08-11 11:06:13 +01:00
c2c4252a07 Merge pull request #216 from nils-asmussen/feature/GaussianSmearing
feature/gaussian smearing
2019-08-08 12:29:55 +02:00
d566637cec Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion 2019-08-07 12:11:40 +01:00
51bed48cd2 added selfcontract module 2019-08-05 17:46:42 +01:00
b875edceab Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil
Conflicts:
	Grid/qcd/utils/BaryonUtils.h
	Hadrons/Modules/MContraction/Baryon2.hpp
2019-08-05 14:19:43 +01:00
29df60c0cb some debugging stuff 2019-08-05 14:10:04 +01:00
8d97e2a02a Say which A2AMatrix is being loaded, and which contraction is being performed (m of n) 2019-08-02 19:23:18 +01:00
ed23f6be20 Remove blank line from log 2019-08-02 15:59:18 +01:00
cad76827b0 Be consistent about separator usage. Log start / stop / duration 2019-08-02 15:47:20 +01:00
310867d46a Additional option to specify the separator used between terms in correlator 2019-08-02 11:25:29 +01:00
e598178d94 TODO: Felix, please fix. I commented this out because of compiler errors 2019-08-01 20:51:51 +01:00
723457d467 Contractor updates ready for test on Tesseract:
1) Move definitions of serialisable objects into header for re-use by external programs/utilities
2) Add "-s" switch for "Simple" correlators, i.e. only include A2AMatrix info for the actual fields included in each contraction
2019-08-01 20:35:55 +01:00
6f40021842 Fixed compiler errors: TODO: Felix, please validate 2019-08-01 19:57:59 +01:00
9cd33a7b9c Printing improvement 2019-07-31 08:01:24 +01:00
639dc1ab21 GPU reduction fix and also exit backtrace option 2019-07-31 01:23:23 +01:00
9117f61109 GPU friendly 2019-07-31 01:22:54 +01:00
bca36d9bc3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-07-30 22:51:23 +01:00
263dcbabab Simplify the comms benchmark 2019-07-30 22:51:04 +01:00
622d5eaa3e Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-07-30 13:47:22 +01:00
e66d48c142 second way to compute baryons - qdp style 2019-07-30 13:46:59 +01:00
9dad7a0094 Reproducible reduction and axpy_norm offload from Gianluca.
Hopefully get CG running entirely on GPU
2019-07-30 00:14:12 +01:00
8c6016f717 Merge pull request #219 from mmphys/feature/include
Housekeeping. #include <Grid.h> ---> #include <Grid/Grid.h>
2019-07-29 23:08:01 +01:00
1282e1067f Do the force term on the accelerator too. Needed particularly because comms buffers
are device memory.
2019-07-29 22:58:35 +01:00
f5ad4f3de8 Added the ability to write a version of the validated XML file excluding any of the module IDs supplied in a separate exclude file 2019-07-26 19:46:55 +01:00
275c1c920f More info dump on error from CUDA 2019-07-26 12:18:53 +01:00
fe700a183a Getting HMC to run 2019-07-26 12:18:29 +01:00
34108296cd Merge branch 'develop' into feature/gpu-port
Conflicts:
	Grid/simd/Grid_avx512.h
2019-07-20 17:05:35 +01:00
76c704b84b Intrinsics for CLANG are now fixed in v6 2019-07-20 16:52:24 +01:00
ce255ec359 Relocate to fix build failure for comms none 2019-07-20 16:37:03 +01:00
1c096626cb Hypercube defaults to on if HPE detected, but override to off possible 2019-07-20 16:06:16 +01:00
ce8b247426 Compiles 2019-07-20 15:16:02 +01:00
80481f81be Constructor typo 2019-07-20 09:58:24 +01:00
d85dcc72df Multinode fix 2019-07-20 07:13:28 +01:00
3fedcd6d52 Compiles 2019-07-20 07:12:44 +01:00
e7050a7aed Support gamma structure names that have trailing white space 2019-07-19 11:58:56 +01:00
e138bc7204 debug output 2019-07-19 11:16:35 +01:00
25ba4c5f80 Merge branch 'develop' into feature/gpu-port
Conflicts:
	HMC/Mobius2p1fEOFA.cc
	tests/forces/Test_rect_force.cc
2019-07-19 11:01:55 +01:00
671bcbcccb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-07-19 10:48:22 +01:00
ff325376cb Fix single precision deriv test fail 2019-07-19 10:47:44 +01:00
6d4fb35d84 Ready for testing 2019-07-19 10:33:03 +01:00
9e926e3fc5 Build fix in develop 2019-07-19 10:01:52 +01:00
775eaee199 Fix for suspected Intel 2018.1 compiler bug under O3 2019-07-19 07:57:34 +01:00
0fd2827d5d Fix fail in single 2019-07-19 05:28:26 +01:00
bdd79f9ef8 TODO update 2019-07-18 22:04:28 +01:00
0695f8cec2 Single precision compile fix. Soon deprecate single precision 2019-07-18 22:02:31 +01:00
9fa705c5a0 comma fix 2019-07-18 21:38:11 +01:00
56cefadf9b gamma matrices as input 2019-07-18 17:46:43 +01:00
9d82855c5d bugfix in Baryonutils 2019-07-18 15:45:43 +01:00
97d61f2564 bugfix in Baryonutils 2019-07-18 14:57:10 +01:00
331f5a53dc New header 2019-07-18 14:51:09 +01:00
a23dc295ac Remove compiler errors and warnings 2019-07-18 14:47:02 +01:00
11a8668d19 bugfix in Baryonutils 2019-07-18 14:44:55 +01:00
cded7670d0 new utils for baryons 2019-07-18 14:29:04 +01:00
feb029fb66 new utils for baryons 2019-07-18 14:24:16 +01:00
08904f830e Merge develop 2019-07-16 11:59:56 +01:00
fa9cd50c5b Merge branch 'develop' into feature/gpu-port 2019-07-16 11:55:17 +01:00
5a62ebe7b1 general baryons case added 2019-07-15 15:26:30 +01:00
7c11525d1a Local stencil for complex wilson loops etc 2019-07-14 14:05:09 +01:00
42c1dbb1d1 General local stencil first cut for Patrick force term 2019-07-14 14:04:28 +01:00
6179acfda0 Put back a call that was required 2019-07-14 13:59:54 +01:00
fa747173d1 Debugging references were to l-values, so added const to stop errors 2019-07-14 11:08:00 +01:00
07601ac1f5 Replace instantiation of Gparity 2019-07-12 17:18:12 +01:00
705a8098b2 Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port
Conflicts:
	Grid/stencil/Stencil.h
2019-07-12 17:14:11 +01:00
a29b43d755 Stencil comms cleaner 2019-07-12 17:12:25 +01:00
368c8369ce Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2019-07-12 17:11:29 +01:00
c0d89a2dbb TODO updates 2019-07-12 17:11:15 +01:00
78ebd93281 Cuda 9.1 happy 2019-07-12 17:11:00 +01:00
3d58daf70f Safety check 2019-07-12 17:10:35 +01:00
bd155ca5c0 Overlap comms with comput now supported 2019-07-12 09:09:40 +01:00
91e2cf9b40 All axes can be used for comms now 2019-07-12 09:08:26 +01:00
3cc9947731 Better welcome printing 2019-07-12 06:47:51 +01:00
f15eeb0283 localise scope of variables declared in macro 2019-07-12 06:47:01 +01:00
0996ba9396 Pretty messaging 2019-07-12 06:45:31 +01:00
12afb0395f Debugging transposeSpin - seems just not to be implemented for Lattice<x> 2019-07-11 17:42:26 +01:00
ec4aa978ab why cant I spinTranspose 2019-07-11 14:01:41 +01:00
966a203dcb Interactions with GPU compilation 2019-07-11 03:16:17 +01:00
44170cc15f Initialise CUDA device prior to entering MPI.
This may or may not interact with Summit which configures MPI - CUDA mapping with jsrun.
TBD
Cases of OpenMPI and MVAPICH are covered, and default to cudaSetDevice(0) otherwise
2019-07-11 03:14:23 +01:00
7bc4a06f3f This is probably what you want ... 2019-07-10 12:29:33 +01:00
cd659525e1 You probably want to add this to the build. And you may need to do a bootstrap 2019-07-10 12:08:37 +01:00
dc2240d2d8 why does sliceSum in Nucleon.hpp not work 2019-07-10 11:34:16 +01:00
98cf20cf06 continued work on baryons 2019-07-09 17:42:36 +01:00
cc3346073e continued work on baryons 2019-07-09 17:30:32 +01:00
3848da7c50 added nucleon module (non-distillation) 2019-07-08 17:43:14 +01:00
c3d0c176ab cleaning up Kl2 contraction 2019-07-05 16:29:46 +01:00
0a71f8bb10 Merge pull request #222 from guelpers/feature/kl2QEDseq
EMLepton: Multiple source-sink separations at once
2019-07-05 16:22:34 +01:00
b7d0cf6751 buxfix in diquark sum / baryons 2019-07-04 22:06:37 +01:00
3a31ba2ea2 Merge remote-tracking branch 'upstream/develop' into feature/kl2QEDseq 2019-07-03 14:37:56 +01:00
eac6337466 Hadrons: EMLepton: multiple source-sink separations at once 2019-07-03 14:36:34 +01:00
ab7537e002 Merge pull request #221 from fionnoh/bugfix/A2ALoop
Bugfix for A2ALoop module
2019-07-03 14:13:51 +01:00
2c1a077369 continued on baryons 2019-07-02 17:55:28 +01:00
6e3c3214a3 Offload loops 2019-07-02 17:25:40 +01:00
d6ffadb33b Coalesced write 2019-07-02 17:25:13 +01:00
ae3abbe53d Added the ability for Perambulator module to save unsmeared sinks through the addition of two optional parameters:
UnsmearedSinkFileName: If present, specifies the filename to write to
UnsmearedSinkMultiFile: defaults to true to write each sink vector to a different file, but can be set to 0 for a single file
2019-07-01 17:28:27 +01:00
5fc0188205 started saving sinks 2019-07-01 14:51:59 +01:00
4c3225412b Drop 5dVEC 2019-07-01 07:31:26 +01:00
b8f7bfbb26 Dont stream as poor perf in some cases 2019-07-01 07:30:25 +01:00
7b7c470917 Accelerator loop 2019-07-01 07:29:51 +01:00
532e226b22 cuda 9.1 fixes 2019-07-01 07:29:22 +01:00
6a13731818 Move GPU cuda call earlier 2019-07-01 07:28:41 +01:00
67690df3bd Changes nedded to have a current insertion on every second time slice - avoids unnecessary contractions 2019-06-28 15:18:28 +08:00
1059189abf Bugfix for A2ALoop module 2019-06-27 13:49:55 +08:00
ce29b18dc9 New modules for loading in MFs as diskvectors and producing propagaotrs from 4 quark contractions 2019-06-27 13:46:06 +08:00
421a0a8a36 Changes to A2Autils, A2AMatirx and DiskVector code that is needed for Hadrons 4 quark contraction module 2019-06-27 13:45:20 +08:00
ac530636ca A2Aloop bugfix 2019-06-27 13:44:47 +08:00
2d940a598c Inserted four extra parameters just to make this test compile. Needs to be fixed properly 2019-06-19 10:37:50 +01:00
c28c5fc61b Inserted four extra parameters just to make this test compile. Needs to be fixed properly 2019-06-19 10:31:41 +01:00
015340d60c Elided superfluous copy on write 2019-06-19 09:37:03 +01:00
1cd4ee0706 Thrust used on GPU builds 2019-06-18 12:50:35 +01:00
b8f71b6777 Fix NVCC warning unused variable 2019-06-17 13:58:45 +01:00
703dc20377 Compile tests fix 2019-06-16 13:59:29 +01:00
d976e5c514 Pow is being awkward in thrust for reasons I don't understand. Possible thrust bug. 2019-06-16 12:05:11 +01:00
d7b3efe893 Compile fix 2019-06-15 17:03:15 +01:00
f710d7bd45 TODO list update 2019-06-15 12:54:27 +01:00
cb336aa8f8 Thread loop constructs changing a little 2019-06-15 12:54:11 +01:00
462900b48d Modified entire test directory to suit new GPU constructs for looping 2019-06-15 12:53:27 +01:00
0561c2edeb Benchmarks modified for new GPU constructs 2019-06-15 12:52:56 +01:00
0184719216 Change to predicate type 2019-06-15 12:52:26 +01:00
24202dbc51 Thread loop construct change 2019-06-15 12:52:07 +01:00
d763c303c5 Clean acceleerator barrier 2019-06-15 12:51:45 +01:00
8e394d3bf9 New loop construct 2019-06-15 12:51:15 +01:00
b881d5489b Move SchurDiagTwoKappa to Algorithms 2019-06-15 12:50:45 +01:00
82306913a8 Move Schur operator into correct place 2019-06-15 12:49:22 +01:00
49f90cc7eb use pragma once 2019-06-15 12:45:22 +01:00
b77af0210b Thread loop. Probably deprecate this impl 2019-06-15 12:44:56 +01:00
5254ede2d8 New loops. Revisit as accelerator loop in future audit 2019-06-15 12:44:29 +01:00
16e5d7945e Hard to make 5D vec work with GPU code 2019-06-15 12:43:43 +01:00
decc99ca76 Accelerator version 2019-06-15 12:43:00 +01:00
464cd65931 Still to test this fully 2019-06-15 12:35:14 +01:00
a1ec2f4723 Still to test this routine fully 2019-06-15 12:33:55 +01:00
ea9662ec85 Thread loop changes 2019-06-15 09:09:57 +01:00
52c74f1cac Thread loop changes 2019-06-15 09:08:16 +01:00
9a13d2992c lean up 2019-06-15 09:05:16 +01:00
b0449ae270 Thread loop changes 2019-06-15 09:04:19 +01:00
1299225105 Accelerator loop changes 2019-06-15 09:03:46 +01:00
5925e7f405 Thread for changes 2019-06-15 09:01:30 +01:00
be1fd4930f Template instantiation make happy changes 2019-06-15 08:37:34 +01:00
377fa5dec1 looping construct 2019-06-15 08:36:48 +01:00
e8b78f596e Looping construct changes 2019-06-15 08:35:57 +01:00
09720c40cd Coalesced loops 2019-06-15 08:35:26 +01:00
bb024dd114 Loop construct changed 2019-06-15 08:30:05 +01:00
52456b9ec7 New loop construct 2019-06-15 08:28:45 +01:00
b285138be4 Better checking on types 2019-06-15 08:27:48 +01:00
c7dbf4c87e Scalar support for GPU threads 2019-06-15 08:25:43 +01:00
1e889c93b8 Insert a GPU synchronise 2019-06-15 08:23:26 +01:00
7379047482 Threading and acceleration primitives further changes. accelerator_barrier() needed and used 2019-06-15 08:22:48 +01:00
d836ce3b78 Clean up of acceleration and threading primitives 2019-06-15 08:14:21 +01:00
cefaacbc07 Changing accelerator loop. Still have work to do for multi-GPU code 2019-06-15 08:10:24 +01:00
0074ef7f69 thread loops 2019-06-15 08:04:29 +01:00
20359ca15f Coalesced loops. 2019-06-15 08:03:57 +01:00
736358b0cb Coalesced loops 2019-06-15 08:03:13 +01:00
6b692aa726 Thread loops 2019-06-15 08:02:26 +01:00
7f99e1cd3b Coalesced loops 2019-06-15 08:01:39 +01:00
f3c89df948 Thread loop changes 2019-06-15 08:00:37 +01:00
b7e6d111d7 Thread loop changes. Need to offload this file 2019-06-15 07:59:10 +01:00
f39cf69c33 Accelerator loop change 2019-06-15 07:58:23 +01:00
8e27338df2 Rationalise number of loop macros 2019-06-15 07:57:40 +01:00
bcbb5e9d26 Remove assembly tests 2019-06-15 07:57:05 +01:00
0ea7f5279d Accelerator loop changes 2019-06-15 07:56:14 +01:00
18e5de426d There is a stray use of predicatedWhere introduced by Andrew Lawson in the conserve currents.
The conserved currents need rewritten using data parallel operations.
2019-06-15 07:53:58 +01:00
e896d81235 Accelerator loop redefine. Coalesce most accesses, but ET engine still to go clean. 2019-06-15 07:52:44 +01:00
7b8ccff4f4 Accelerated coalesced loops in most cases 2019-06-15 07:48:00 +01:00
68541606ab Thread loop changes. Soon try these with accelerator loops and benchmark 2019-06-15 07:46:42 +01:00
339ea10cc7 First touch only on CPU code 2019-06-15 07:45:43 +01:00
d0d8dc8042 Thread loop changes 2019-06-15 07:45:09 +01:00
81eb1fd9f2 Accelerator loop changes for coalesced access 2019-06-15 07:44:47 +01:00
cb93d32cd9 Thread loop changes 2019-06-15 07:44:08 +01:00
8f223962ff Thread loop changed 2019-06-15 07:43:42 +01:00
9a8a63467e BC2 now runs. setup() runs twice, which had resulted in doubling up of momenta. Also fixed initialisation of momentum phases. 2019-06-12 15:25:59 +01:00
36f06555a2 Simplify Impl 2019-06-09 22:26:27 +01:00
d6c0e0756d Remove GPU version 2019-06-09 11:23:42 +01:00
3e41b1055c Remove Gpu only kernels. 2019-06-09 11:20:01 +01:00
9fbcfe612c Update TODO list 2019-06-09 11:19:38 +01:00
e78a5e7838 ASM instantiation without link errors 2019-06-09 01:25:21 +01:00
da8d87e9da Cuda switch off 2019-06-08 17:11:38 +01:00
8e3a05d89b Moving the instantiation into a cleaner structure 2019-06-08 13:48:33 +01:00
8adc5da7dd Testig out approaches to kernel writing introducing SIMT_loop temporarily 2019-06-08 13:47:04 +01:00
29a244e423 Test of using a lane variable instead of repeated reference to threadIdx.y 2019-06-08 13:46:26 +01:00
18cbfecf02 Use symlinks in find command 2019-06-08 13:45:46 +01:00
c933ac2248 Temporarily introduce a SIMT_loop to test out approaches prior to making a global change to
accelerator_loop
2019-06-08 13:44:27 +01:00
ad2c433574 Instantiations move. Tried using Gianluca's suggestion about avoiding threadIdx but doesn't
seem to make a difference. Will revisit this and probably remove the lane parameter from the coalescedRead
2019-06-08 13:43:12 +01:00
86e7fb6e86 Instantiation relocation 2019-06-08 13:42:46 +01:00
fb91dda7be Hand instantiation moved location 2019-06-08 13:42:26 +01:00
82cf7bc5ab Move instantiation into fermion/instantiation 2019-06-08 13:41:46 +01:00
e452cc0a22 Move static variables into instantiation .cc file 2019-06-08 13:41:20 +01:00
4d2b938166 Remove explict instantiation from here 2019-06-08 13:41:01 +01:00
10d16ab76c Remove explict instantiation from here 2019-06-08 13:40:32 +01:00
1f997fa484 Instantiate via explict .cc files for parallel make. 2019-06-08 13:39:51 +01:00
dc5024e88c The GPU reduction was not working for me and causing errors. Need to revisit.
Gianluca is working on deterministic reduction/
2019-06-08 13:39:11 +01:00
6d77941990 Drop the 5D vec actions 2019-06-08 13:38:05 +01:00
0ee6e77cbc Compiles GPU and CPU, still gives good performance on CPU 2019-06-05 13:28:16 +01:00
18d3cde29a Compile on GPU workd 2019-06-05 00:14:58 +01:00
7323099966 Instatiation fix 2019-06-05 00:14:38 +01:00
6379651cdd Generic or GPU ready for benchmark test on GPU 2019-06-05 00:13:52 +01:00
ba4fd756b9 Fix signature, but deprecating this loops style 2019-06-05 00:12:36 +01:00
d185fc1ebf clean up instantiation 2019-06-05 00:11:52 +01:00
96b36d8367 Instantiation clean up 2019-06-05 00:11:27 +01:00
899f8b5065 Instantiation clean up 5d vec removal 2019-06-05 00:11:05 +01:00
c8d0483fe9 Remove 5d vectorisation 2019-06-05 00:10:37 +01:00
0f214e5f76 Clean up instantiation 2019-06-05 00:10:13 +01:00
8eea568426 GPU loop ; presently differentiated with ifdef, find a way to unify. 2019-06-05 00:09:28 +01:00
9636324069 GPU happy code 2019-06-05 00:08:54 +01:00
8a5489d9e6 Move the loop into a central kernel call. 2019-06-05 00:08:13 +01:00
8113845f9c coalesce loop. Need to rationalise this file 2019-06-04 23:49:29 +01:00
b47f73c222 GPU happy 2019-06-04 21:30:39 +01:00
5720ced0fd Simplifying 2019-06-04 21:30:08 +01:00
2c87b56b53 Making GPU happier 2019-06-04 21:29:44 +01:00
dbad48d802 Remove Ls vectorised DWF 2019-06-04 21:27:40 +01:00
4557a1365a Remove Ls vectorised DWF 2019-06-04 20:59:59 +01:00
16e9b87d98 Remove Ls vectorised DWF as unused and hard to maintain 2019-06-04 20:59:01 +01:00
685eea3d0f Small cosmetic 2019-06-04 20:58:14 +01:00
65b48831fb Simplify code 2019-06-04 20:56:30 +01:00
57396fc595 Simplify code 2019-06-04 20:56:23 +01:00
a2e199df50 Simplifying Cayley cases. 2019-06-04 20:54:52 +01:00
020346c848 WOrk list. Will have to clean up Fermion sector. 2019-06-04 20:54:00 +01:00
c2625a127e Non blocking loop. Want to change the naming here. 2019-06-04 20:52:59 +01:00
8794d35c78 GPU 2019-06-04 20:52:27 +01:00
24bff6dbe6 Minor improvements 2019-06-04 20:51:48 +01:00
45b15d10d3 GPU happy changes 2019-06-04 20:49:16 +01:00
33d6bbe32b GPU must use accelerator vectors 2019-06-04 20:48:52 +01:00
7a1569bd46 Annoying, cannot rely on equivalence of Grid ComplexD adn Eigen Complex type on GPU.
Solve with ComplexD typecasts but must be a better way
2019-06-04 20:47:49 +01:00
6e2e904a0e NVCC compiles happy. Start to develop strategy for writing generic
code for GPU kernels and CPU kernels.
2019-06-04 20:46:35 +01:00
d92a17f359 Suppress NVCC warnings in pugixml with pragma 2019-06-04 20:45:53 +01:00
47c063f984 Remove Ls Vec cases from benchmarks 2019-06-04 20:45:35 +01:00
7e27a5213a Tests builds clean. 2019-06-04 20:45:20 +01:00
fe72dc099b Upgrade to Mojave forced me to reinstall MacPorts. These are the ports I installed to get Grid working 2019-06-04 16:12:24 +01:00
ade4a126da Getting closer on the GPU port, but will start deleting 5th dim vectorised variants
for code maintainability
2019-06-04 11:53:44 +01:00
7b59ab5bd7 Compiling after reorganisation 2019-06-03 15:46:26 +01:00
fcd8cfe257 Gparity in 2019-06-03 15:45:09 +01:00
b4b53812cb Move implementation to specific implementation headers 2019-06-03 15:43:01 +01:00
085cac583f Implementation in header 2019-06-03 15:42:36 +01:00
25e3b8640c Move to header 2019-06-03 15:42:05 +01:00
c81d3d422d Housekeeping. #include <Grid.h> ---> #include <Grid/Grid.h> 2019-06-03 15:25:05 +01:00
54edb9906e Housekeeping. #include <Grid.h> ---> #include <Grid/Grid.h> 2019-06-03 15:20:46 +01:00
44bbec50b0 Making GPU compile happy 2019-06-03 14:57:04 +01:00
ec68b67d5d Attempt at unified GPU and CPU kernel 2019-06-03 14:55:51 +01:00
778450e0c8 Move to implementation subdir 2019-06-03 14:53:56 +01:00
567aa5f366 Move to implementation subdir 2019-06-03 14:53:33 +01:00
2ab7e2b175 Force instantiation in .cc files.
Eventually move into multiple files
2019-06-03 14:52:59 +01:00
6f61be044d Dont instantiate in header 2019-06-03 14:52:01 +01:00
269e00509e Don't instantiate in header 2019-06-03 14:51:24 +01:00
a5e90b0ddc Making the kernels more GPU happy 2019-06-03 14:50:54 +01:00
5622faf226 pragma once ifdef guard 2019-06-03 14:50:26 +01:00
82ecd520c7 Macos happy fix under nvcc 2019-06-03 14:48:50 +01:00
620965781e MSource::Convolution remove test code 2019-06-02 13:44:19 +01:00
9c18638b24 MSource::Convolution let mom argument be Nd dimensional 2019-06-02 13:41:39 +01:00
4bfe678218 MSource::Gauss Integer is unsigned... 2019-06-02 12:36:57 +01:00
fc6e584f2c MSource::Gauss fix sign in exponent of normalization + use correct types 2019-06-02 11:52:05 +01:00
7c3f400fc5 MSource::Gauss add parameters tA and tB 2019-06-02 00:12:15 +01:00
4bca2c17ce MSource::Convolution rename parameters 2019-06-02 00:04:07 +01:00
8d540a4e85 MSource::Gauss add mom parameter + avoid Cshifts 2019-06-01 23:56:14 +01:00
9ff459816f ReadBinary needs to do case insensitive name comparison (since I changed the default case of perambulator column names) 2019-06-01 13:50:27 +01:00
eb737daeb5 Merge branch 'develop' into feature/distil
* develop: (34 commits)
  Hadrons: EMLepton: Wall source
  Revert "cleaning up Kl2 contraction"
  cleaning up Kl2 contraction
  posibility to save/load schedules directly from the application parameters
  moving VERSION file to the empty ChangeLog one, this create compilation problems with #include <version> in recent versions of LLVM and case-insensitive FS (typically macOS)
  Added precision tuning to Hadrons parameterfile writing
  Kl2 QED cleanup
  Added ZFIMPL to SeqGamma
  Added ZFIMPL to SeqConserved module
  F1 ensemble running with 96%~ acceptance etc..
  Make detection of HPE 8600 automatic
  Added variables that were missing from wall source setup
  Exposed a coulomb/landau enum to the gauge fixing module
  Coulomb gauge added as an option
  More logging, timing, and 4d/5d logic for eigpack gauge transforms
  Added gauge transform option to eigpack IO
  Hadrons: Lepton Propagator for kl2, sign swap for antiperiodic boundary
  A2A Lepton-Meson Field contraction
  Verbose
  Iteratoin range fix
  ...
2019-05-31 18:20:43 +01:00
b120ef1fe4 Merge pull request #217 from guelpers/feature/EMlepwall
Hadrons: EMLepton: Wall source
2019-05-30 11:13:27 +02:00
166feb6483 Hadrons: EMLepton: Wall source 2019-05-30 10:07:08 +01:00
f569813b60 remove commented code 2019-05-29 17:07:07 +01:00
0190ada714 Merge branch 'develop' into feature/GaussianSmearing 2019-05-29 17:01:17 +01:00
de1a1dccb3 MSource::Gauss and MSource::Convolution: change LatticeComplex to ComplexField 2019-05-29 16:25:45 +01:00
0b3f40ce16 MSource::Convolution fix sign in Momentum 2019-05-29 16:06:10 +01:00
e35e8da111 Revert "cleaning up Kl2 contraction"
This reverts commit f244fed6ab.
2019-05-29 11:23:17 +02:00
6fdf93d695 move momentum phase from MSource::Gauss to MSource::Convolution 2019-05-28 17:26:55 +01:00
ffde81f22a Nsimd() and coalesced support 2019-05-25 12:44:07 +01:00
d8098f1ecd coalesced support 2019-05-25 12:43:31 +01:00
aca788cf4f Move coalesced read into tensors 2019-05-25 12:43:00 +01:00
6064f96fde MSource::Gauss remove superfluous comment 2019-05-24 20:18:37 +01:00
4e52e46a2c MSource::Gauss fix missing factor 2019-05-24 20:16:09 +01:00
6b27369ade MSource::Convolution use type PropagatorField 2019-05-24 16:07:08 +01:00
ab2e5f88cd add fields as input (for scheduler) 2019-05-24 15:57:30 +01:00
f244fed6ab cleaning up Kl2 contraction 2019-05-24 13:08:35 +01:00
9b3701ae27 posibility to save/load schedules directly from the application parameters 2019-05-24 13:08:20 +01:00
4ac27340b9 moving VERSION file to the empty ChangeLog one, this create compilation problems with #include <version> in recent versions of LLVM and case-insensitive FS (typically macOS) 2019-05-24 13:05:17 +01:00
c7c0a1065f Merge pull request #214 from guelpers/feature/kl2QEDseq
Kl2 contraction with sequential propagators
2019-05-23 20:31:41 +01:00
80947130f9 Merge pull request #215 from fionnoh/develop
Added precision tuning to Hadrons parameterfile writing
2019-05-23 18:44:58 +01:00
0aee73ea6b Added precision tuning to Hadrons parameterfile writing 2019-05-23 18:43:25 +01:00
e43d59045e add option mom to MSource::Gauss 2019-05-23 17:33:32 +01:00
e553678599 add modules MSource::Gauss and MSource::Convolution 2019-05-23 16:38:13 +01:00
0290ee1f6d Merge pull request #213 from fionnoh/develop
Added ZFIMPL to SeqConserved module
2019-05-23 13:46:02 +01:00
9a34edcf9f Kl2 QED cleanup 2019-05-23 13:43:22 +01:00
246f10001e Added ZFIMPL to SeqGamma 2019-05-23 12:42:40 +01:00
e675c6a48c Merge remote-tracking branch 'upstream/develop' into feature/kl2QED 2019-05-23 12:41:54 +01:00
a66d110b88 Added ZFIMPL to SeqConserved module 2019-05-23 11:49:54 +01:00
918e673078 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-05-22 09:57:02 +01:00
44b53c3ba2 F1 ensemble running with 96%~ acceptance etc.. 2019-05-22 09:56:26 +01:00
2095c12eac Make detection of HPE 8600 automatic 2019-05-22 09:54:21 +01:00
a0e9f3b0a0 Plan for GPU port 2019-05-20 09:46:19 +01:00
ae5ad986e2 Merge remote-tracking branch 'upstream/develop' into feature/kl2QED 2019-05-19 14:35:46 +01:00
a9342c6ae5 Udpdate TODO afer gianluc marge 2019-05-18 22:58:25 +01:00
ee6f96d85c Merge pull request #210 from grid-test-organisation/feature/gpu-port-develop
Cayley fermion functions for GPUs
2019-05-18 19:06:20 +01:00
77ca45ff49 Merge pull request #211 from fionnoh/develop
Enum for gaugefix and bug fix for wall source
2019-05-18 18:57:52 +01:00
4e9df9e93c GPU patches 2019-05-18 17:43:11 +01:00
9fe68857a9 Runs multiGPU with coalesced access on tesseract 2019-05-18 17:42:41 +01:00
37336c9e0c Allow compress to be either vector or scalar types 2019-05-18 17:41:13 +01:00
6c4da3bbc7 Stencil now runs with coalesced accesses 2019-05-18 17:40:35 +01:00
a584b16c4a Adding a non-blocking kernel launch 2019-05-18 17:39:54 +01:00
dbd7f3f0fc Added variables that were missing from wall source setup 2019-05-17 19:10:09 +01:00
d14512ee03 Exposed a coulomb/landau enum to the gauge fixing module 2019-05-17 19:01:52 +01:00
48b1c806ed Coulomb gauge added as an option 2019-05-17 17:36:32 +01:00
8ce7ebdca3 fixed contraction issue 2019-05-17 10:52:55 +01:00
435653490e fixed contraction issue 2019-05-17 10:50:15 +01:00
10a052d695 3 issues preventing compilation under clang. Marked these with FELIX_ISSUE and made minimal change to make compile (as fix not obvious) 2019-05-17 09:59:01 +01:00
acd5a01b65 some work on baryons 2019-05-16 15:11:50 +01:00
0a8b6724ef Merge pull request #209 from fionnoh/develop
Added gauge transform option to eigpack IO
2019-05-15 18:09:44 +02:00
ce102ac550 More logging, timing, and 4d/5d logic for eigpack gauge transforms 2019-05-15 14:31:25 +01:00
94accec311 Added gauge transform option to eigpack IO 2019-05-15 13:35:47 +01:00
1a82533d22 fix inner product with thrust reduction 2019-05-14 15:35:54 +01:00
ec7d96ce3b Merge branch 'develop' into feature/distil
* develop:
  Hadron WeakEye and A2ALoop bug fixes, and WWVVContraction bug fix
  DiskVector: fix of memory bug triggering segfault when the cache is accessed following a certain pattern
  MFermion::GaugeProp fix for 4d fields
2019-05-14 13:10:40 +01:00
e3c56fd9b3 CayleyZeroCounters before benchmark loop 2019-05-13 15:52:00 +01:00
955cc7790f MooeeInvDag offloaded to GPU 2019-05-13 14:25:29 +01:00
1179123ac2 MooeeInv offloaded to GPU 2019-05-13 12:37:12 +01:00
d8512b03f8 Merge pull request #195 from nils-asmussen/fix_GaugeProp_4d
MFermion::GaugeProp fix for 4d fields
2019-05-12 21:31:18 +02:00
d90cf9d022 Merge pull request #207 from fionnoh/develop
Weak Hamiltonian and contraction bug fixes
2019-05-12 21:30:20 +02:00
79e930ba12 Hadrons: Lepton Propagator for kl2, sign swap for antiperiodic boundary 2019-05-10 12:46:18 +01:00
22e35c9ddd M5Ddag offloaded to GPU 2019-05-10 12:23:39 +01:00
698b45e163 remove unused typedef 2019-05-09 11:19:39 +01:00
f1744b3f01 M5D offloaded to GPU 2019-05-09 11:17:55 +01:00
2b3c22f03d bandwidth dependent on grid default precision 2019-05-08 12:01:11 +01:00
8423a05940 duplicate CayleyFermion5D for gpu 2019-05-08 11:51:37 +01:00
2acd8ece65 Hadron WeakEye and A2ALoop bug fixes, and WWVVContraction bug fix 2019-05-08 10:57:36 +01:00
b638509c61 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-05-08 10:51:04 +01:00
c16916cc45 Multiple local slice fixes 2019-05-06 10:35:42 +01:00
a865caf0d2 Forgot a const in IndexName only version of NamedTensor constructor 2019-05-03 22:17:25 +01:00
9ae4d369f3 Use the definition of the Perambulator Index names given in Hadrons::MDistil 2019-05-03 22:00:50 +01:00
edeb590818 DiskVector: fix of memory bug triggering segfault when the cache is accessed following a certain pattern 2019-05-03 17:09:47 +01:00
ec24a1f828 Fixed 2 bugs in LapEvec: 1) InsertLocalSlice 2) ensure convergence assertion stops entire machine 2019-05-03 16:03:56 +01:00
0efe63f6fa 3D smearing fix 2019-05-02 19:37:59 +01:00
b7ead6c16a Fixed bug: iff stout smearing disabled then gauge field uninitialised 2019-05-02 18:20:49 +01:00
d9438627d9 M5D benchmark without vector copy overhead 2019-05-02 11:10:57 +01:00
b23305dbe2 fix M5D flop count 2019-05-02 11:08:21 +01:00
d3b5c02e2d measure M5D bandwidth and fix M5D flop count 2019-05-02 11:02:39 +01:00
8b6541fb60 Fix gpu MultRealPart and MaddRealPart bug 2019-05-02 10:58:17 +01:00
6da9aa9971 replace std::vector with Vector in benchmark 2019-05-02 10:56:22 +01:00
44e0360b97 replace std::vector with Vector 2019-05-02 10:55:36 +01:00
9003c4a07c allocator copy constructor (to be fixed) 2019-05-02 10:53:37 +01:00
b52fa38f8c seed initialisation of RNG5 2019-05-02 10:36:09 +01:00
3f1c4d8789 fix comment hash 2019-05-02 10:24:36 +01:00
62692b68b9 I'd forgotten that Intel '17 doesn't like auto var{value}; syntax 2019-05-01 20:45:16 +01:00
311c35a15c Looking for fixes for Intel '17 compiler errors. std::cout << complex number ? 2019-05-01 18:22:08 +01:00
a3fe57f430 NamedTensor writes to tag NamedTensor by default (not filename) - so still usable in case user renames file.
Also tweaked tensor index name checking (which is used to ensure tensor is correct type)
2019-05-01 18:11:37 +01:00
8dc0587621 Post Michael / Felix review. Ready for Peter / Antonin review 2019-05-01 13:04:51 +01:00
cfe5fa7a35 1) Don't write Laplacian eigenvectors to disk 2) Add a test that loads perambulators from disk 2019-05-01 09:50:23 +01:00
e72e26c899 Get rid of unnecessary multiFile options 2019-05-01 08:53:08 +01:00
334f29becb Fairly close to ready for release. Felix and I to review, then submit for release 2019-04-30 23:53:57 +01:00
e56ead55ef WIP 2019-04-30 14:41:48 +01:00
4f0631615f A2A Lepton-Meson Field contraction 2019-04-30 12:04:59 +01:00
d74d443d1b Pre-release cleanup in progress 2019-04-29 22:18:29 +01:00
4203105104 Part-way through release tidy-up 2019-04-29 18:40:38 +01:00
c2cd0e15d7 Merge remote-tracking branch 'upstream/develop' into feature/kl2QED
Conflicts:
	Grid/qcd/action/fermion/DomainWallFermion.h
	Grid/qcd/action/fermion/FermionOperator.h
2019-04-29 12:07:20 +01:00
ac19c0e04f This will need to be removed eventually, but should save us fiddling about with each release 2019-04-29 09:20:08 +01:00
b48ca8a6ef Merge branch 'develop' into feature/distil
* develop: (36 commits)
  Mobius 2+1f sign off.
  Integrator logging on by default
  RHMC for mobius
  HMC make file
  Update
  Simple check
  Simple checks
  Monius HMC
  Changes locally
  Power method
  Momentum rescaling
  Bounds checking
  Bounds checking
  Scale momentum convention to CPS/UKQCD MD time
  Add bounds checking
  Updated documentation after Peter's review. 1) Removed version numbers from Grid dependencies 2) Explained in a little more detail how to use Xcode to build Grid and Hadrons libraries
  Remove bundled Eigen stuff
  Fix typo so it matches develop
  Remove bundled source from my local repository
  Slightly generalize interface to SchurRedBlackBase and derived solver classes so we can pass forecasted initial guesses in EOFA heatbath correctly
  ...
2019-04-29 08:37:39 +01:00
c48ae4f3ad 1) Only the boss should write the perambulator - possibly was a source of intermittent corruption?
2) Implemented and test a perambulator conversion utility in Test_distil (commented out near the start of main)
2019-04-28 23:24:57 +01:00
fb74de0798 Making sure Hdf5 is an optional dependency (default to binary writer if not present) 2019-04-28 20:23:44 +01:00
adc1eaee68 Switched to Hdf5 format for perambulators. Ready for first test on Tesseract. 2019-04-28 17:53:42 +01:00
60330e05a3 NVCC wacky compiler options frozen. Possibly Cuda 9.2 specific 2019-04-28 07:39:33 +01:00
f9b8c0cccf Vector changes for UVM 2019-04-28 07:38:57 +01:00
3cad67e569 Compile on tesseract 2019-04-28 07:38:09 +01:00
170ba4e619 Ensure different MPI ranks use different GPUs. The mapping works on Tesseract. 2019-04-28 07:32:30 +01:00
204a090497 Inner product is not working on GPU. Why? 2019-04-28 07:31:56 +01:00
3c717c47ef GPU no compile on Wilson Multigrid fixed 2019-04-28 07:31:19 +01:00
5aca4e8670 Just realised that the trace is at every lattice site, so moved the check for no smearing further up 2019-04-26 17:23:18 +01:00
e223d0b99f Need to validate range about which exp^iQ is considered unity 2019-04-26 16:00:35 +01:00
2e220456d3 First attempt at minimising smearing 2019-04-26 15:54:05 +01:00
4333d97958 fixed parameter 2019-04-26 14:29:21 +01:00
55c9c45d4b Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-04-26 14:28:01 +01:00
e70e03f560 started stout smearing for small w 2019-04-26 14:27:40 +01:00
ff5e2e0f47 Debug output fix. Meant to print the rho matrix for stout smearing ... not the address of the function that creates it 2019-04-26 12:30:41 +01:00
4f3d1ea6e8 Two heads are better than one. Combined effort and hopefully spatial smearing now fixed! 2019-04-26 12:18:11 +01:00
b1768ba820 Urgh! 2019-04-26 10:04:27 +01:00
3ac5a69a57 Ready to test spatial smearing (again) 2019-04-26 08:54:30 +01:00
50a74eaea3 Doesn't compile. Does it still need to be maintained? 2019-04-26 08:33:10 +01:00
8419fbb335 Renamed PerambLight module. Check with Felix whether Test_24 and Test_tesseract still need to be maintained 2019-04-26 08:23:15 +01:00
23a9b93cda More dependencies for Distil.hpp move and (C) 2019 only 2019-04-26 07:39:05 +01:00
ecdc3ddebf Moved Distil.hpp and added GNU license to all files 2019-04-26 07:24:56 +01:00
df41de4cb6 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-24 12:02:50 +01:00
6d0b985697 Verbose 2019-04-24 06:29:52 +01:00
94ebcf551c Iteratoin range fix 2019-04-24 06:28:14 +01:00
6fd4b0be91 Evolving HMC status 2019-04-23 21:54:45 +01:00
7894ea6263 Now have mixed precision solves in the 2f sector 2019-04-23 21:54:19 +01:00
73d4676997 Action and Deriv solvers allowed to differ 2019-04-23 21:53:44 +01:00
262a73c964 COmment improvement 2019-04-23 21:52:58 +01:00
5921b1d2b9 Layout/whitespace changes 2019-04-23 21:52:33 +01:00
6505efcb57 Set iteration count if guess is already good 2019-04-23 21:51:57 +01:00
b595f58e4c Allow HMC to acces matrix 2019-04-23 21:51:23 +01:00
b0de7ab7db Extra do nothing guesser 2019-04-23 21:50:45 +01:00
e1124d9572 Integrator verbosity updates 2019-04-23 21:50:15 +01:00
606698511c Seems we've not been keeping the test up-to-date 2019-04-22 19:03:24 +01:00
a97b814f0c Remove redundancy in LapEvec filename 2019-04-19 14:09:36 +01:00
7214681e11 Spatial smearing doesn't work yet. Fixed inconsistency in naming of perambulator in PerambLight.hpp 2019-04-19 13:54:25 +01:00
143b75956c Stout smearing 3D fixes. Changed LapEvec to perform spatial smearing only 2019-04-19 11:54:02 +01:00
d416156c16 Mobius 2+1f sign off. 2019-04-19 07:57:08 +01:00
cd8d939a1a Integrator logging on by default 2019-04-19 07:54:17 +01:00
760cfe294c RHMC for mobius 2019-04-19 07:53:54 +01:00
4a4203c610 fixed stout smearing for now 2019-04-18 19:10:49 +01:00
2b598294c9 added distil source module 2019-04-18 17:47:09 +01:00
13eaf21b5c HMC make file 2019-04-18 11:53:26 +01:00
1403ab231b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-18 11:06:02 +01:00
0368fbcde8 Update 2019-04-18 11:05:53 +01:00
2dd0ec7862 Merge pull request #186 from djm2131/feature/eofa-bug-fixes
Merge feature/eofa-bug-fixes into develop
2019-04-17 14:54:06 +01:00
f4241e59ba Merge pull request #200 from mmphys/feature/XcodeDoc
Updated documentation after Peter's review.
2019-04-17 14:51:19 +01:00
26b1d2df2d Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-17 12:08:06 +01:00
bc14e86812 Simple check 2019-04-17 12:07:42 +01:00
780a67844e Simple checks 2019-04-17 12:07:17 +01:00
8b7805200f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-17 12:05:09 +01:00
2871dec6c0 Monius HMC 2019-04-17 12:04:57 +01:00
abde12433e Changes locally 2019-04-17 12:03:20 +01:00
1f88ba4e39 Power method 2019-04-17 12:03:05 +01:00
ea5b3ed8a2 Momentum rescaling 2019-04-17 12:01:06 +01:00
a104115c7d Bounds checking 2019-04-17 11:56:46 +01:00
b899042d81 Bounds checking 2019-04-17 11:55:43 +01:00
3e712fe643 Scale momentum convention to CPS/UKQCD MD time 2019-04-17 11:54:17 +01:00
f4723e07c5 Add bounds checking 2019-04-17 11:52:23 +01:00
9ed2d02bb2 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-12 12:11:06 +01:00
d111c70c38 Merge branch 'develop' into feature/distil
* develop:
  Make sure Grid::Serializable can write Eigen Tensors to output streams. NB: 1) The Eigen package defines operator<< for Eigen tensors, but this format is different, hence Grid::Serializable::WriteMember 2) For simplification, the contents are written in memory order. I.e. Different results will be obtained depending on whether the tensor is row- or column-major
  ... this time without the new Distillation modules ...
  Eigen tensor serialisation fixes after Antonin's review
  Iterator added. Will wait for review comments before finalising.
  Fix build with Intel '17 compiler, i.e. workaround incorrect auto types for c++ style definitions. E.g. assuming T::rank is an int, then objects defined like so:     const auto rank{T::rank}; should also be int. Unfortunately, Intel '17 instead defines them to be std::initializer_list<int>, then proceeds to complain where these variables are used that they cannot be converted to int. NB: This was fixed under Intel '18
  Pushed paboyle's changes: Updates for clang happy
  Merge paboyle's no compile in single precision Intel 2019 fix
  Eigen::Tensor serialisation. Tested on single and double precision builds
2019-04-10 13:14:24 +01:00
50d016340c Merge pull request #190 from mmphys/feature/distil-checkin
Eigen::Tensor serialisation. Tested on single and double precision builds
2019-04-10 12:49:06 +01:00
f7b4fd0f69 Make sure Grid::Serializable can write Eigen Tensors to output streams. NB:
1) The Eigen package defines operator<< for Eigen tensors, but this format is different, hence Grid::Serializable::WriteMember
2) For simplification, the contents are written in memory order. I.e. Different results will be obtained depending on whether the tensor is row- or column-major
2019-04-06 15:40:23 +01:00
ed2427d5f7 Make sure Grid::Serializable can write Eigen Tensors to output streams. NB:
1) The Eigen package defines operator<< for Eigen tensors, but this format is different, hence Grid::Serializable::WriteMember
2) For simplification, the contents are written in memory order. I.e. Different results will be obtained depending on whether the tensor is row- or column-major
2019-04-06 15:37:53 +01:00
1f1aa92f14 Updated documentation after Peter's review.
1) Removed version numbers from Grid dependencies
2) Explained in a little more detail how to use Xcode to build Grid and Hadrons libraries
2019-04-06 13:42:39 +01:00
ea2f34de7b Updated documentation after Peter's review.
1) Removed version numbers from Grid dependencies
2) Explained in a little more detail how to use Xcode to build Grid and Hadrons libraries
2019-04-06 13:37:47 +01:00
00963a7499 twist and boundary conditions for free propagator 2019-04-05 10:08:27 +01:00
63dc0fa7e9 Fixed memory leak ... without breaking semantics of prior code. Possibly should change the semantics? For Peter / Antonin to comment 2019-04-04 16:00:17 +01:00
5e6104e683 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-04-04 12:13:35 +01:00
25e4ee3a49 3D Stout smearing added 2019-04-04 12:13:16 +01:00
82a77f9960 ... this time without the new Distillation modules ... 2019-04-03 23:02:26 +01:00
00b4139c16 Eigen tensor serialisation fixes after Antonin's review 2019-04-03 22:48:07 +01:00
4161429dcc Serialisation fixes after Antonin's review 2019-04-03 22:30:07 +01:00
3e9c757b3b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-03 16:56:48 +01:00
b5eb97206b Merge branch 'develop' into feature/distil
* develop:
  MGauge::GaugeFix use standard convention for fields
  fix bug: MGauge::GaugeFix should not modify its input
  add gauge transformation matrix as output to module MGauge/GaugeFix
2019-04-03 16:24:49 +01:00
ecf736e6bf Merge pull request #193 from nils-asmussen/fix_GaugeFix_inputmod
Fixes #192 and adds gauge transformation matrix as output.
2019-04-02 18:28:50 +01:00
f22ab5e1bc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-02 11:29:59 +01:00
72f959c0b8 MGauge::GaugeFix use standard convention for fields 2019-03-29 16:51:21 +00:00
63001d3fa6 fix bug: MGauge::GaugeFix should not modify its input 2019-03-29 16:51:11 +00:00
b1d3d1f1a9 add gauge transformation matrix as output to module MGauge/GaugeFix 2019-03-29 16:51:00 +00:00
c2250fa124 MFermion::GaugeProp fix for 4d fields 2019-03-29 16:36:56 +00:00
84940fbdf0 Hadrons: Lepton Propagator for kl2 2019-03-28 10:15:09 +00:00
0da906cf66 Merge branch 'develop' into feature/distil
* develop:
  Documentation for using Grid with Xcode on Mac OS
2019-03-27 23:08:29 +00:00
0a270b3e93 Merge pull request #191 from mmphys/GridXcode
Documentation for using Grid with Xcode on Mac OS
2019-03-27 22:53:35 +00:00
6536bed8a4 Documentation for using Grid with Xcode on Mac OS 2019-03-27 20:51:20 +00:00
3decb5f886 Merge branch 'develop' of github.com:paboyle/Grid into feature/distil
* 'develop' of github.com:paboyle/Grid:
  endianness fix in resilient IO
2019-03-27 20:39:23 +00:00
faa8bb9bc6 Fixed funny memory leak 2019-03-27 17:55:52 +00:00
4c02ed6d0c Updated GridXcode documentation 2019-03-27 13:54:39 +00:00
f757b80e1c tried to fix mem leak 2019-03-27 12:00:36 +00:00
b8581be1da : 2019-03-27 11:59:06 +00:00
79160011a1 endianness fix in resilient IO 2019-03-26 16:06:13 +00:00
9fce1263be Fixed bug in LapEvec if machine running spread-out in time 2019-03-26 13:24:39 +00:00
ae565b006a Compiling in single-precision now works 2019-03-25 22:56:01 +00:00
8502660023 Begin fixes for single precision 2019-03-25 20:40:05 +00:00
47f5b1e2b5 Iterator added. Will wait for review comments before finalising. 2019-03-25 18:19:55 +00:00
625a97a466 cosmetic 2019-03-25 18:16:04 +00:00
bce2766fef Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-25 16:38:42 +00:00
ce501afec6 bugfix 2019-03-25 16:38:25 +00:00
1d10a3b3de Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  bugfix
2019-03-25 15:50:57 +00:00
d1e02f50ff Added iterator for Eigen tensors 2019-03-25 15:50:29 +00:00
48b03c4590 bugfix 2019-03-25 15:45:35 +00:00
b3b9e608e1 added new module for noises 2019-03-25 14:13:03 +00:00
4e87cbd400 Fix build with Intel '17 compiler, i.e. workaround incorrect auto types for c++ style definitions.
E.g. assuming T::rank is an int, then objects defined like so:
const auto rank{T::rank};
should also be int. Unfortunately, Intel '17 instead defines them to be std::initializer_list<int>, then proceeds to complain where these variables are used that they cannot be converted to int. NB: This was fixed under Intel '18
2019-03-23 09:28:41 +00:00
a381d34f37 Fix build with Intel '17 compiler, i.e. workaround incorrect auto types for c++ style definitions.
E.g. assuming T::rank is an int, then objects defined like so:
    const auto rank{T::rank};
should also be int. Unfortunately, Intel '17 instead defines them to be std::initializer_list<int>, then proceeds to complain where these variables are used that they cannot be converted to int. NB: This was fixed under Intel '18
2019-03-23 09:24:15 +00:00
4fc045b563 added module to load perambulators from disk 2019-03-22 13:50:47 +00:00
fbf286b0e3 added Spin dilution 2019-03-22 13:30:11 +00:00
9dc3fe9922 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  modules list
2019-03-22 13:00:06 +00:00
f0c2108acf Pushed paboyle's changes: Updates for clang happy 2019-03-22 12:59:14 +00:00
6c9029fab7 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-22 12:41:56 +00:00
8700dd4d0d modules list 2019-03-22 12:41:53 +00:00
9c16391e55 Merge branch 'develop' into feature/distil
* develop:
  Updates for clang happy
2019-03-22 12:08:50 +00:00
93a5fc083f Updates for clang happy 2019-03-22 11:39:22 +00:00
6d1de8ed2e Merge paboyle's no compile in single precision Intel 2019 fix 2019-03-21 16:48:08 +00:00
685d9bafef Merge branch 'develop' into feature/distil
* develop:
  No compile in single precisoin Intel 2019 fix
2019-03-21 16:36:48 +00:00
116dde31eb No compile in single precisoin Intel 2019 fix 2019-03-21 14:13:33 +00:00
d2d26b302d Removed the module we don't need from modules.inc (so make now works)
i.e. removed Modules/MDistil/PerambMultipleSolves.hpp from Hadrons/modules.inc
2019-03-20 22:59:20 +00:00
12d8bf1ced Eigen::Tensor serialisation. Tested on single and double precision builds 2019-03-20 22:27:41 +00:00
88cb004731 Fixed single-precision issues in Test_serialisation 2019-03-20 22:05:16 +00:00
a66bb8acba fixed possible memory leak 2019-03-20 14:41:36 +00:00
4ae35000a9 removed module which we do not need 2019-03-20 13:36:57 +00:00
02b96b4602 Fixed module list (messed up when I merged from develop) 2019-03-20 11:20:40 +00:00
11dded61e8 Merge branch 'develop' into feature/distil
* develop: (29 commits)
  precision fix
  Updates after review with Peter.
  Wilson clover multi grid for lime lattice
  Recommendations for Traits classes
  Hadrons: uninitialised pointer fix (might have been harmless)
  Hadrons: beware of the nasty uninitialised twists
  Smearing test. Test on free field.
  Smearing for quark observables
  Smearing
  Hadrons: XML validator utility
  display relative norm during field IO norm check
  possibility to set a build number
  IO norm check on relative norm
  Output field norm check during IO
  Hadrons: random vector utility module I/O
  quieter initialisation
  fix patch command for eigen in bootstrap.sh
  Mres changes and gauge xform mat changes
  Hadrons: 32 bit I/O directly in Lanczos module
  Hadrons: copyright update
  ...

# Conflicts:
#	Grid/tensors/Tensor_traits.h
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-03-20 10:35:36 +00:00
d921a99b1a precision fix 2019-03-19 17:07:40 +00:00
9790926cc5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-03-19 16:35:48 +00:00
24cf3b9df5 Ignore Version.h as it's created by automake/autoconf 2019-03-19 12:12:39 +00:00
9c8aa2047d Put GridXcode doc in subdirectory 2019-03-19 07:33:19 +00:00
204cfa1c5a Added documentation for Grid using Xcode 2019-03-19 07:28:29 +00:00
fe6845d38b Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-18 14:44:18 +00:00
a6adb85a1b Merge pull request #185 from mmphys/feature/trait-recommend
Recommendations for Traits classes
2019-03-18 12:22:42 +00:00
bff4eeec41 Added disclaimer on half-precision types 2019-03-18 12:15:25 +00:00
98b5b61fea Remove bundled Eigen stuff 2019-03-15 19:44:28 -04:00
6896c57d7c Fix typo so it matches develop 2019-03-15 19:34:36 -04:00
b3d480a978 Remove bundled source from my local repository 2019-03-15 19:17:03 -04:00
bb731c97d6 Slightly generalize interface to SchurRedBlackBase and derived solver classes so we can pass forecasted initial guesses in EOFA heatbath correctly 2019-03-15 19:10:56 -04:00
974003ae96 Fix sign convention of ExactOneFlavourRatioPseudoFermionAction::deriv() to match force conventions for Integrator class 2019-03-15 19:04:29 -04:00
93348775af Resolved merge conflict 2019-03-15 19:01:37 -04:00
d1fe4dce33 new idea to get multiple perambulators 2019-03-15 10:28:02 +00:00
50ca3101de bug in multiSolves and new test prog 2019-03-13 17:25:55 +00:00
0faf40e207 last commit did not compile - fxied this 2019-03-13 13:24:18 +00:00
5313e44d11 some cleanup 2019-03-13 13:15:12 +00:00
6bb9b67c93 externalised gauge field reading to hadrons module 2019-03-13 12:09:12 +00:00
a0405c6d84 PerambMultipleSolves.hpp compiles (not had time to test) 2019-03-12 14:01:29 +00:00
c2a3231cdf added testing module for multiple perambulators 2019-03-11 18:05:39 +00:00
5fb2ee89bb modified test so that it runs 2019-03-08 16:50:21 +00:00
608a98d870 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-08 16:28:34 +00:00
2df396380d solver is now external 2019-03-08 16:28:21 +00:00
64ba664637 changed debug options 2019-03-08 12:25:00 +00:00
4a70b2ffd4 Aslash insertions work now? 2019-03-08 12:23:22 +00:00
2d659015ff Serialisation is fully functional. Ready for review. 2019-03-08 00:30:43 +00:00
e63019ac50 Tensor serialisation is fully functional 2019-03-08 00:01:45 +00:00
dde118fed9 added everythong to compute sequential aslash fields 2019-03-07 17:36:53 +00:00
1538bf8c34 added everythong to compute sequential aslash fields 2019-03-07 17:36:22 +00:00
4abc498ae3 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-07 15:34:10 +00:00
93dfbfbfcd added module to compute perambulator from a solve 2019-03-07 15:33:50 +00:00
f9e273d4bf Making sure same as Traits-recommend 2019-03-07 14:33:04 +00:00
91cffef883 Updates after review with Peter. 2019-03-07 14:30:35 +00:00
584fa0a633 Changes after review with Peter 2019-03-07 12:53:34 +00:00
d3935ae7fc Hadrons: some updates in WeakMesonDecayKl2 2019-03-06 15:27:59 +00:00
73cdca3973 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-06 13:55:51 +00:00
d716f8a0c9 new module for baryon contraction 2019-03-06 13:55:36 +00:00
aa24f04911 Changed EigenIO to use GridTypeMapper type traits 2019-03-06 12:55:05 +00:00
0b426bf9f6 Merge remote-tracking branch 'upstream/develop' into feature/kl2QED
Conflicts:
	Hadrons/Modules.hpp
	Hadrons/modules.inc
2019-03-06 11:28:59 +00:00
1880e6d12d Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-06 11:16:34 +00:00
4a00513e65 Moving Eigen trensor utilities to separate (optional) header 2019-03-06 11:16:22 +00:00
7718ee199a efficient baryon test program 2019-03-05 17:16:42 +00:00
d7c7bff065 added output for source meson fields on all tsrc 2019-03-05 12:01:55 +00:00
802675f062 baryons should compile now... 2019-03-04 17:31:21 +00:00
acd25d0d01 Wilson clover multi grid for lime lattice 2019-03-04 11:30:15 +00:00
d56d8c923f Replaced an error in A2AUtils.h that was stopping the build with an assert() 2019-03-02 00:36:53 +00:00
00c3c6fc54 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-02 00:24:47 +00:00
b3d4ba8657 Fixed issues with Eigen Tensor serialisation. Fixed issues with precision to text streams 2019-03-02 00:24:37 +00:00
a4d578bd5d baryons work now??? 2019-03-01 14:44:39 +00:00
7653649389 baryons working now 2019-03-01 12:57:41 +00:00
a344a2227e Fixing build errors 2019-02-28 20:30:16 +00:00
b7db99967a Recommendations for Traits classes 2019-02-28 20:06:59 +00:00
4b9200b35c Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-28 19:06:36 +00:00
91be028507 Still one issue on write 2019-02-28 19:06:25 +00:00
3b05f91f5c Prototype for template traits recommendations 2019-02-28 19:04:44 +00:00
8804271339 efficient baryons compile! 2019-02-28 16:32:40 +00:00
6d9f377913 added parity 2019-02-28 11:05:31 +00:00
18b603c5ae simple but hopefully efficient baryon field 2019-02-28 10:27:05 +00:00
e9784572af baryons... 2019-02-27 17:51:25 +00:00
b930eda69d Merge branch 'develop' of github.com:paboyle/Grid into develop 2019-02-27 02:27:46 +00:00
7852181c2c Hadrons: uninitialised pointer fix (might have been harmless) 2019-02-27 02:27:40 +00:00
bdf87bc994 Hadrons: beware of the nasty uninitialised twists 2019-02-27 02:27:09 +00:00
f168a9e7ee continued with baryons 2019-02-26 16:41:52 +00:00
50b6db75da Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-02-26 15:57:09 +00:00
df065f1d57 first test configs 2019-02-26 15:57:01 +00:00
136e7b2314 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-02-26 11:31:36 +00:00
1ea64b24fe Smearing test. Test on free field. 2019-02-26 11:31:17 +00:00
8f661f6c05 Smearing for quark observables 2019-02-26 11:31:00 +00:00
ae9e248c95 Smearing 2019-02-26 11:29:12 +00:00
578eb177e7 Tweaked format and memory use on Xml format. Still crashes (out of memory) on large read on my laptop 2019-02-25 22:03:21 +00:00
81b3f3d2ca Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 15:39:07 +00:00
7c7ffa3b10 Added text read/write 2019-02-25 15:38:47 +00:00
1f098ceecf Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 15:36:23 +00:00
c47c1a2472 started working on baryons - this time efficiently 2019-02-25 15:36:11 +00:00
ec45b16840 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 14:10:34 +00:00
9288019789 Added Xml IO (has one deficiency: the format for multi-dimensional data is flat) 2019-02-25 14:10:24 +00:00
351ffe73cd Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-02-25 14:06:09 +00:00
9c04139362 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 12:40:44 +00:00
cfc14a7432 more adjustments to test 2019-02-25 12:40:32 +00:00
31e40c26fa Oops. Forgot to delete SortNode (prevented linking) 2019-02-25 11:35:33 +00:00
3f2fe5c7e7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 11:18:34 +00:00
76b6e8a01e first tesseract test 2019-02-25 11:18:25 +00:00
f9543982e4 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 11:07:43 +00:00
3c9f2d4106 Chunking layout reasonably efficient. Looks for small prime factors of each dimension, falling back to approximate size if needed. 2019-02-25 11:07:29 +00:00
6160795a43 dt^2 term comments 2019-02-24 15:23:20 +00:00
ded2d5c3ab HMC directory 2019-02-24 15:22:57 +00:00
04255128ef HMC directory 2019-02-24 15:22:17 +00:00
a9a3248cb5 More precision in rect force test 2019-02-24 15:21:19 +00:00
7c461dc664 Bounds checking plan setup 2019-02-24 15:19:48 +00:00
15fddde9bf ConstEE override in Clover 2019-02-24 14:44:43 +00:00
048397d880 Default tau spacing should be longer c.f. Zbigniew Srocinsky thesis 2019-02-24 14:43:22 +00:00
196c9e4a4a Better conformable check with message 2019-02-24 14:42:52 +00:00
6a0823718e Make ConstEE except override in clover 2019-02-24 14:41:59 +00:00
22476cc5a3 Power method estimator of spectral range 2019-02-24 14:37:56 +00:00
cb16c96dc7 Hadrons: XML validator utility 2019-02-22 18:41:26 +00:00
cad26a736e quick&dirty fix for g5*field 2019-02-22 17:05:16 +00:00
4f2ac433f1 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-22 16:31:26 +00:00
f9e505108b test Aslash 2019-02-22 16:31:17 +00:00
d2aced13da Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-02-22 16:30:40 +00:00
03d031d623 tesserct test 2019-02-22 16:30:22 +00:00
44a2d4854a Ensured Hdf5 chunk size always less than 4GB 2019-02-22 15:14:32 +00:00
292ff33f7f Removed issue with std::string_literal 2019-02-21 16:51:05 +00:00
55886cf9db ran make_module_list.sh 2019-02-21 16:14:13 +00:00
c640923159 Fixed reference to depth from test 2019-02-21 15:48:52 +00:00
752530f352 Gotten rid of c++17 in Test_serialisation.cc 2019-02-21 14:43:07 +00:00
34b9450fc9 Gotten rid of c++17 2019-02-21 14:22:48 +00:00
5d6462b706 bugfix 2019-02-21 11:13:10 +00:00
f70c5b004a some cleanup in Baryon2pt 2019-02-20 12:56:13 +00:00
5bb9de9242 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-19 17:37:37 +00:00
982a24514b Binary IO also implemented and tested 2019-02-19 17:37:21 +00:00
97c6f770b4 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-19 17:22:41 +00:00
4522f1e831 separated final 2pt Contraction 2019-02-19 17:22:30 +00:00
c14547ddbe EigenIO writing rationalised. All indices (trivial or not) written 2019-02-19 16:12:55 +00:00
63c97db414 Prior to rationalising 2 versions of BaseIO::write (scalar and vector) 2019-02-19 13:29:08 +00:00
6ebb32ffbf Rationalised Test_serialisation 2019-02-18 21:40:53 +00:00
07c97cb424 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-18 17:12:36 +00:00
04b58de5de Read-back working. 2019-02-18 17:12:27 +00:00
6e822b7201 added sign for contraction sum 2019-02-18 15:21:13 +00:00
625ccfcd72 continued baryon contraction code 2019-02-18 13:10:34 +00:00
c77069244d Nearly ready. Just finishing off readback and compare 2019-02-18 08:55:50 +00:00
9815ddb853 Started read routines. Introduced readMultiDim and tested I didnt break anything 2019-02-16 19:30:33 +00:00
74a3a5b825 Fixed existing bug in Hdf5Reader::readDefault for std::vector<U> 2019-02-16 18:45:46 +00:00
00e9416e0a Tweak to initialisation example 2019-02-16 17:08:22 +00:00
b6803a070a Making sure I understand row-major vs column-major ordering 2019-02-16 16:18:28 +00:00
bfd2770657 started on baryon flavour sums 2019-02-15 15:51:46 +00:00
668b1e77c7 small changes 2019-02-15 15:31:53 +00:00
e51744260f Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 14:32:26 +00:00
e0987d7d81 first contraction version done 2019-02-15 14:32:17 +00:00
26b94d7bda Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 13:53:00 +00:00
df0c8b5d84 Test of Eigen slices 2019-02-15 13:52:49 +00:00
a111d814db Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 10:47:41 +00:00
e8bd8767c0 Get rid of declarations inside constexpr functions. if constexpr warning remains 2019-02-15 10:06:15 +00:00
8cb96cb693 Hmmm lots of warnings depending on compiler ... 2019-02-14 19:17:12 +00:00
b9bee45277 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-14 19:05:47 +00:00
bee24655cd Finalising traits 2019-02-14 19:05:35 +00:00
886c895f81 baryon field structure is now eigentensor - started on contractions for 2pt functions 2019-02-14 16:44:54 +00:00
e37614bde4 display relative norm during field IO norm check 2019-02-14 16:23:50 +00:00
042bad2ced possibility to set a build number 2019-02-14 13:58:17 +00:00
59c8cc1588 Minor bugfix 2019-02-13 22:11:24 +00:00
11467a994d Enough for tonight 2019-02-13 21:48:35 +00:00
9f2ca98dfc enseble can now be specified in LapEvec 2019-02-13 13:54:31 +00:00
bf434b6bef Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-13 12:14:18 +00:00
41ff592515 Moved serialisation tests into Test_serialisation 2019-02-13 12:14:01 +00:00
48ec937c55 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-13 11:48:57 +00:00
65731546b7 merge... 2019-02-13 11:48:34 +00:00
76c6a6772a Added rank_non_trivial 2019-02-12 22:15:55 +00:00
e7048231bc Working version with additional Grid traits pre: review by Antonin 2019-02-12 13:59:48 +00:00
49babeab19 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-11 23:26:46 +00:00
fb2cb3015e Writing of Eigen::Tensor of grid objects now works (for Hdf5) 2019-02-11 23:26:18 +00:00
53f45d2c7e Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-11 17:39:55 +00:00
d889cebc60 unique string is now used 2019-02-11 17:39:42 +00:00
9a225235b6 Can write both fixed and dynamic sized tensors (small tidy) 2019-02-11 17:15:38 +00:00
dff7d9261d Can write both fixed and dynamic sized tensors 2019-02-11 15:47:40 +00:00
6f2663edf6 Serialisation of an object containing an Eigen::Tensor works for Hdf5. Still quite a lot of tidying up to do. 2019-02-10 23:19:20 +00:00
5bc0857412 IO norm check on relative norm 2019-02-10 22:12:47 +00:00
b540dc1cee Output field norm check during IO 2019-02-10 21:41:17 +00:00
7672bb6434 Hadrons: random vector utility module I/O 2019-02-10 21:25:25 +00:00
f80c548365 quieter initialisation 2019-02-10 20:47:35 +00:00
d5024bd07e Hdf5 writing of scalar (i.e. no Grid subtypes) Eigen::Tensor works. But issues when adding Eigen::Tensor to serialisable object. 2019-02-10 15:33:16 +00:00
9c4189484a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-09 17:12:43 +00:00
3720103f41 Adding Eigen::Tensor still WIP 2019-02-09 17:12:36 +00:00
c4d27ee30f added parity operator to baryon fields 2019-02-08 15:49:52 +00:00
d26a5dce12 bugfix 2019-02-08 14:37:09 +00:00
5843a943d9 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-08 14:36:08 +00:00
c1341b8ed2 bugfix 2019-02-08 14:33:06 +00:00
6a4515d0cd baryons have now the correct (?) structure - also easier! 2019-02-07 12:27:57 +00:00
a0a39e4b00 Fixed initialisation of vector of Complex 2019-02-06 21:56:44 +00:00
b9fb16077c Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 21:37:54 +00:00
4b3c566c89 ../tests/hadrons/Test_hadrons_distil.cc 2019-02-06 21:36:46 +00:00
c8bcee6e97 Merge pull request #183 from nils-asmussen/fix-eigen-patch
fix patch command for eigen in bootstrap.sh
2019-02-06 14:36:36 +00:00
cbd2dfe53f Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 12:56:49 +00:00
6cdb1eb62c BContraction now computes what might be a baryon function, but probably isn't 2019-02-06 12:23:52 +00:00
6e0d43aef5 fix patch command for eigen in bootstrap.sh 2019-02-06 11:25:51 +00:00
ed7175076b Turned off warning of unused variable line 150 2019-02-06 09:32:13 +00:00
27677b3870 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 09:25:39 +00:00
7423f5af1a Examples of how to access Grid Tensors 2019-02-06 09:25:24 +00:00
c1257208e2 Mres changes and gauge xform mat changes 2019-02-05 23:43:00 +00:00
74c38822ed Hadrons: 32 bit I/O directly in Lanczos module 2019-02-05 21:56:51 +00:00
318c64adc2 Hadrons: copyright update 2019-02-05 19:13:37 +00:00
d5b053f86f Hadrons: 1 propagator loop construction now using A2A vectors 2019-02-05 19:12:38 +00:00
c60e50e3cb Hadrons: copyright update 2019-02-05 18:55:24 +00:00
08d8b1d5fb Hadrons: 4-quark eye 3-pt contractions 2019-02-05 18:53:20 +00:00
21d6dbe0b6 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-05 17:32:39 +00:00
1ee84509b5 added baryons project - not working yet 2019-02-05 17:32:26 +00:00
57e57d162f Removed Eigen::DontAlign attribute 2019-02-05 12:50:28 +00:00
90d6d28547 Hadrons: non-eye weak 3pt fix 2019-02-05 11:35:10 +00:00
5b0870bb19 Added Scalar_ length and Scalar_Unit_Size to Perambulator file for validation 2019-02-05 09:07:05 +00:00
7f5354630a Updated perambulator binary format to save payload in big endian format on disk 2019-02-04 23:07:59 +00:00
9c31305b8d Hadrons: test cleaning 2019-02-04 21:26:25 +00:00
2eb584fdf0 Hadrons: 4-quark non-eye 3-pt contractions 2019-02-04 21:24:07 +00:00
6b46834af8 Hadrons: archiving unmaintained or exotic modules 2019-02-04 21:23:30 +00:00
3692c7f1ef Hadrons: type alias cleaning and global correlator class (need to propagate) 2019-02-04 21:21:51 +00:00
0cf94587cd array with all gammas for convenience 2019-02-04 21:20:16 +00:00
008ac6b5ae Permabulator is read back from disk if it exists instead of being created 2019-02-04 12:06:32 +00:00
c7aa4e0c1f Perambulator filename can be specified in xml. NB: Perambulator binary format now includes data size in bytes to avoid type mismatches. 2019-02-04 11:30:30 +00:00
43bd918a47 Logging tweak 2019-02-03 21:48:50 +00:00
7eda54bb87 Only write indices with dimesion!=1 2019-02-03 20:58:58 +00:00
bd75b843fa Added checksum to data 2019-02-03 20:31:42 +00:00
8865bf5d7c Implemented perambulator read/write ... but in binary format. Will switch to Hdf5 when I have Antonins feedback 2019-02-03 17:05:19 +00:00
caabbcd951 minor change 2019-02-01 17:50:18 +00:00
48528c5b1d Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
Added index names to Perambulator
2019-02-01 15:31:27 +00:00
f7b90a0c14 Added index names to perambulator 2019-02-01 15:20:35 +00:00
a9848becb0 unsmeared sinks can now be computed - new test program available 2019-02-01 13:23:42 +00:00
7cc13f48d5 added some TODO comments; needs discussion 2019-01-31 16:54:11 +00:00
b6b267fd4b Fixed new test parameters 2019-01-31 15:11:12 +00:00
9671a61bb2 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-31 15:07:45 +00:00
d7dc617746 Switched perambulator to sue Eigen::Tensor (file write temporarily excluded) 2019-01-31 15:06:52 +00:00
68868c83ff Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-01-31 14:46:56 +00:00
32cb2e1a9a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-31 13:01:31 +00:00
3d31113337 added test t5 to compute meson fields of different quarks. Different nvec are allowed. 2019-01-31 13:01:16 +00:00
48b6f7e6ad Changed PerambLight<FIMPL> to PerambLight<GIMPL> 2019-01-31 12:37:00 +00:00
0da411fe60 LapEvec fixes 2019-01-31 12:28:38 +00:00
d7b9ed199d PerambLight fixes 2019-01-31 12:24:32 +00:00
7e74f7bec4 tsrc != 0 now works 2019-01-31 11:35:05 +00:00
dae7b30b92 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-30 21:16:30 +00:00
f7e4661ca0 Fixed grid3d leak in PerambLight 2019-01-30 21:16:09 +00:00
7b66197534 meson fields are now the same 2019-01-30 18:03:34 +00:00
c3273eff20 agreement up to laph vectors 2019-01-30 11:20:22 +00:00
67a3d7aeed added debug output, perambulators now agree up to 8 digits 2019-01-29 16:24:59 +00:00
d8831fe925 changed parameters to match Test_Distil 2019-01-29 13:40:26 +00:00
c7ceff6a21 Switched to Gauge field (GIMPL) 2019-01-28 12:28:35 +00:00
5580b3a7d1 bugfix in DistilVectors 2019-01-28 12:24:47 +00:00
33d8fb2dd9 Default 2019-01-25 19:21:12 +00:00
9f6f776460 ensured there is a default test to run 2019-01-25 19:14:22 +00:00
84fe36d084 meson functions work until to be saved 2019-01-25 17:26:43 +00:00
3438dde8df test prog now computes everything up to meson fields 2019-01-25 15:19:18 +00:00
aea49bc349 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-25 13:44:30 +00:00
9ef6f9878e test works up to perambulators now 2019-01-25 13:44:19 +00:00
708ca8585a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-25 13:26:56 +00:00
d15bf4b8e1 Added trajectory number to output file 2019-01-25 13:26:48 +00:00
7496da0987 bugfix in prambLight 2019-01-25 13:08:56 +00:00
2568f5b925 bugfix in prambLight 2019-01-25 12:37:18 +00:00
577cdf1d72 Simplified tests 2019-01-24 18:50:18 +00:00
f92ed659a7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-24 16:30:28 +00:00
dfb7fb1d9f LapEvec test works on --grid 4.4.4.8 2019-01-24 16:30:13 +00:00
a4c1ab6147 all modules linked in test prog 2019-01-24 16:12:19 +00:00
cf85f0388d Still debugging eigenvector parameters 2019-01-24 13:26:05 +00:00
00b0f75b0d Eigenvectors created. Still need to correctly set parameters for test. 2019-01-24 12:44:06 +00:00
b45586e81c Discovered bug root cause. setup() is called multiple times. Now ready to copy-paste the LapEvec code 2019-01-23 21:17:56 +00:00
2c7e6bf58b Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 15:20:06 +00:00
7c5a06f6d0 Trying to work out why LapEvec constructor not being called 2019-01-23 15:19:51 +00:00
068ef85b05 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 15:08:24 +00:00
a6ab742fdb added perambs to test 2019-01-23 13:58:20 +00:00
2062a8d578 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 13:00:20 +00:00
3d3e8f4f9f Structured objects passed into LapEvec 2019-01-23 12:59:55 +00:00
2756f16a5e created test prog for perambs 2019-01-23 12:49:20 +00:00
d7908c33de moved hard-coded parameters in DistilVectors to module input 2019-01-23 11:32:53 +00:00
4cc2ebc9e4 moved hard-coded parameters to module input 2019-01-23 11:26:07 +00:00
b8afa7314c Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-23 10:51:23 +00:00
be5605931c merge 2019-01-23 10:51:09 +00:00
09fa821510 Added remaining methods to Permabulator 2019-01-22 17:59:55 +00:00
f45d2d5dcc perambLight done, but SliceShare and Write does not work yet 2019-01-22 15:52:26 +00:00
0a82fae45c moved perambulator definition to shared header file 2019-01-22 15:06:45 +00:00
46b05aa9c5 cleaned up, deleted commented out old code 2019-01-22 13:48:44 +00:00
813c1ab1f1 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-22 13:28:09 +00:00
b1c27a141d DistilVectors complete and compiling - not tested at all! 2019-01-22 13:27:51 +00:00
81bb361299 Test program ready 2019-01-22 13:19:39 +00:00
79d533550d continued on DistilVectors.hpp 2019-01-21 16:45:31 +00:00
b8c106f320 working on DistilVectors, initialisation done and compiles 2019-01-21 16:04:18 +00:00
b74492a805 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-21 10:40:01 +00:00
c93a43f158 Added test program 2019-01-21 10:39:28 +00:00
0ff410ae19 copied perambulato code into PerambLight.hpp 2019-01-18 17:47:41 +00:00
ced30b61e2 added phi vectors - still commented out and does not compile otherwise 2019-01-18 16:38:13 +00:00
2b782df290 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-18 15:58:51 +00:00
f0f1ba0307 uses evec4d now 2019-01-18 15:58:10 +00:00
2343e621e6 Bananas 2019-01-18 13:32:27 +00:00
2568504821 small change 2019-01-18 13:23:03 +00:00
b821dde020 Initial version 2019-01-18 13:14:28 +00:00
ae3b053334 Initial version 2019-01-18 13:10:02 +00:00
9b6ddb6e54 Adding a norm of a general field check, so that for things other than gauge configs there is an analogue of plaquette norm.
Improve argument checking in the BinaryIO.h, as there looks to be some corruption issue intermittently on tesseract jobs.
Not clear where the root bug is.
2019-01-16 22:35:58 +00:00
447b772136 Merge remote-tracking branch 'upstream/develop' into feature/kl2QED 2019-01-07 15:09:18 +00:00
c5e081d69c Re-Merge branch 'develop' into feature/gpu-port
Pull in Regensburg MultiGrid pull request
2019-01-03 01:50:16 +00:00
535a6aaf05 Update todo list 2019-01-02 22:07:51 +00:00
91a7fe247b Merge branch 'DanielRichtmann-feature/wilsonmg' into develop 2019-01-02 14:40:31 +00:00
8a1be021d3 Merge branch 'feature/wilsonmg' of https://github.com/DanielRichtmann/Grid into DanielRichtmann-feature/wilsonmg 2019-01-02 14:39:59 +00:00
e73b909a48 Make tests running past nvcc. Different NVCC versions proving tricky to keep happy. This is 9.2 2019-01-02 12:05:30 +00:00
a4d9200293 Fixing AVX 512 instantiation error. Need to move to extern templates urgently. 2019-01-02 00:27:07 +00:00
350508bdb3 pugixml problem 2019-01-01 16:38:54 +00:00
38852737e4 No compile fix on clang 2019-01-01 15:55:13 +00:00
802404c78c Remove warnings under NVCC and move parallel_for to thread-loop 2019-01-01 15:08:09 +00:00
0e9b591c1c NVCC warning suppression 2019-01-01 15:07:47 +00:00
c43a2b599a GPU support 2019-01-01 15:07:29 +00:00
8c91e82ee8 GPU clean up, remove parallel_for. Split into accelerator_loop, thread_loop
cases, and collides with parallel_for in thrust
2019-01-01 15:06:46 +00:00
9d866d062a GPU support improvements 2019-01-01 15:05:03 +00:00
3a4e397e72 Deprecating JSON, too hard to support under NVCC 2019-01-01 15:04:33 +00:00
2b6cfe555f Disable JSON on NVCC. Maybe unsupport JSON full stop. XML and JSON is too many formats in my view. 2019-01-01 15:03:50 +00:00
7df58dd883 Photon syntax gave problems with NVCC 2019-01-01 15:03:29 +00:00
4bf86ae60a NVCC clean up 2019-01-01 15:02:50 +00:00
07ee87ff5a GPU happy. Still need to prevent hand kernels being callable under NVCC 2019-01-01 15:00:33 +00:00
0c2498fe2f Explicit instantiation needed for NVCC 2019-01-01 13:55:12 +00:00
ad2e65dad5 GPU related updates 2019-01-01 13:54:40 +00:00
715babeac8 GPU reductions first cut; use thrust, non-reproducible. Inclusive scan can fix this if desired.
Local reduction to LatticeComplex and then further reduction.
2019-01-01 13:53:37 +00:00
3eae9a9e3f update NVCC flags 2019-01-01 13:49:15 +00:00
186aad065f Roll forward Eigen in attempt to make CUDA happy 2019-01-01 13:48:32 +00:00
bf5685eb11 Update todo list 2019-01-01 13:48:06 +00:00
4a96c067ae Remove warnings from NVCC 2019-01-01 13:43:09 +00:00
ab063f33c0 Offload the linear combinations in CG 2019-01-01 13:42:13 +00:00
9efcc535bc Cleaner drop from CUDA mode around Eigen includes. Remains difficult to let Eigen compile under nvcc with version issues. 2019-01-01 13:39:10 +00:00
231b61d012 std::array by default 2019-01-01 13:37:35 +00:00
e898f4f0b0 Whitespace 2019-01-01 13:36:55 +00:00
d5db5f5242 Wrong dimension used in a temporary 2018-12-20 10:49:45 +00:00
2fcedb13dd Step size modification in HMC; ICC happy thread pragmas 2018-12-20 09:32:33 +00:00
35ed1defac Passes make check now single and double compile 2018-12-19 11:09:32 +00:00
4e95accf80 Namespace fix 2018-12-15 21:46:17 +00:00
fd66325321 pure QED test and copyright update 2018-12-14 17:39:11 +00:00
c637c0c48c James H.'s code for general size Wilson loops 2018-12-14 17:37:09 +00:00
c4b472176c Photon code fix 2018-12-14 17:36:38 +00:00
422764757d Updates in tests to make all of Grid compile 2018-12-14 16:55:54 +00:00
943fa48ce4 Hadrons: Kl2 contraction using sequential propagators 2018-12-14 13:45:30 +00:00
fa97a56fdd Hadrons: sequential Aslash insertion on propagator 2018-12-14 12:40:26 +00:00
856476a890 big cleanup of the Photon class + QED Coulomb gauge 2018-12-13 21:52:38 +00:00
afc462bd58 Bracketing issue in macro 2018-12-13 10:53:22 +00:00
b57a4d32aa Merge branch 'develop' into feature/gpu-port 2018-12-13 05:11:34 +00:00
c509bd3fe2 Merge branch 'feature/resilient-io' into develop 2018-12-01 12:57:43 +00:00
49b934310b resilient I/O fix 2018-11-27 20:17:09 +00:00
01e8cf5017 Merge branch 'develop' into feature/resilient-io 2018-11-27 19:09:59 +00:00
12f4499502 HDF5 serialiser fix 2018-11-27 19:09:50 +00:00
05aec72887 Hadrons: application parameter for resilient I/O 2018-11-27 18:46:43 +00:00
136d3802cb binary parallel IO can do read tests and eventually re-write in case of failure 2018-11-27 18:38:24 +00:00
a4c55406ed checksummed HDF5 IO 2018-11-27 17:43:19 +00:00
c7f33ca2a8 Revert "Hadrons: A2A vector write can fail and retry"
This reverts commit 10fc263675.
2018-11-27 17:27:26 +00:00
0e3035c51d Revert "optional non-fatal checksum fail in Lime lattice read (with error codes)"
This reverts commit bccfd4cbb3.
2018-11-27 17:27:20 +00:00
10fc263675 Hadrons: A2A vector write can fail and retry 2018-11-26 19:47:03 +00:00
bccfd4cbb3 optional non-fatal checksum fail in Lime lattice read (with error codes) 2018-11-26 19:45:51 +00:00
0b50d4a328 log time fix 2018-11-23 15:51:27 +00:00
b74940b3d4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-11-23 14:08:29 +00:00
e232257cb6 Hadrons: A2AAslashVector modul cleaning and renaming 2018-11-22 19:43:49 +00:00
09451b5e48 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-11-22 15:45:24 +00:00
6364aa8acf Merge branch 'feature/contractor' into develop 2018-11-22 15:44:46 +00:00
b9e84ecab7 Hadrons: minor code cleaning 2018-11-22 15:44:30 +00:00
41032fef44 Optional RW mode for Hdf5Reader 2018-11-21 18:36:50 +00:00
d77bc88170 Optional support for faster CRC32C checksum through Intel IPP 2018-11-19 17:21:53 +00:00
494b3c9e57 Hadrons: contractor more IO fix 2018-11-19 16:26:53 +00:00
2ba19a9e07 Hadrons: contractor IO fix 2018-11-19 16:17:51 +00:00
5d7cc29eaf Hadrons: contractor token @traj@ for trajectory number in input file 2018-11-19 16:04:01 +00:00
f22a27d7f9 Hadrons: contractor trajectory loop and file output 2018-11-19 15:45:04 +00:00
33a0bbb17b Const correctness 2018-11-19 11:27:57 +00:00
f592ec8baa Hadrons: contractor performance fix 2018-11-16 20:59:49 +00:00
8b007b5c24 Hadrons: remove the use of OpenMP reductions 2018-11-16 20:00:29 +00:00
17b3f47b1e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-11-16 16:32:12 +00:00
9bb170576d Merge pull request #177 from guelpers/develop
Hadrons module to electrify a gauge
2018-11-14 16:04:09 +00:00
a7e3977b75 Merge remote-tracking branch 'upstream/develop' into develop 2018-11-13 14:56:23 +00:00
995f20e45d Hadrons: some renamings 2018-11-13 14:54:48 +00:00
d058b4e681 Merge branch 'feature/seqA2A' into develop 2018-11-13 13:27:24 +00:00
2ac57370f1 Hadrons: contractor translation average normalisation 2018-11-12 16:04:35 +00:00
344e832a4e Hadrons: contractor faster transpose and finer timings 2018-11-12 15:59:54 +00:00
cfe281f1a4 Hadrons: diskvectors measure hash performance in debug output 2018-11-12 15:59:11 +00:00
f5422c7334 Hadrons: more contractor instrumentation 2018-11-09 16:23:53 +00:00
68c76a410d Hadrons: more contractor improvements 2018-11-08 19:24:29 +00:00
69b6ba0a73 Hadrons: contractor fixes and improvements 2018-11-08 18:46:28 +00:00
65349b07a7 Hadrons: simpler A2A perf functions 2018-11-08 18:44:44 +00:00
7cd9914f0e Hadrons: automatically resize output in MKL A2A matrix kernels 2018-11-08 17:40:57 +00:00
8ef4657805 Merge remote-tracking branch 'upstream/develop' into feature/seqA2A 2018-11-08 09:00:06 +00:00
78c1086f8b Hadrons: sequential Aslash insertion and propagator on A2A vector 2018-11-08 08:58:09 +00:00
1ff1422e07 Hadrons: contractor lighter output 2018-11-07 20:02:53 +00:00
32376f0437 Hadrons: contractor performances 2018-11-07 19:59:11 +00:00
0c6e581336 Hadrons: first stab at general contraction code, needs serious testing 2018-11-07 19:16:55 +00:00
e0a79a5bbf Hadrons: PR#177: Electrify gauge: Single Precision fix 2018-11-07 15:01:22 +00:00
4c016cc1a4 Merge remote-tracking branch 'upstream/develop' into develop 2018-11-07 14:03:12 +00:00
88d9922e4f Hadrons: fast A2A matrix contraction kernels 2018-11-06 19:49:09 +00:00
9734e3ee58 Hadrons: (somewhat) faster build 2018-11-06 19:47:41 +00:00
f1382cf81d Merge remote-tracking branch 'upstream/develop' into develop 2018-11-06 10:29:52 +00:00
85699daef2 Hadrons: Module to electrify a gauge field 2018-11-06 10:27:18 +00:00
1651111d18 Hadrons: final, portable form of the contractor benchmark 2018-11-05 21:29:13 +00:00
1ed4ea344d Merge branch 'develop' into feature/contractor 2018-11-05 11:42:02 +00:00
4a7415e83c Hadrons: contractor benchmark update 2018-10-23 21:00:54 +01:00
0ffcfea724 Hadrons: contractor benchmark 2018-10-23 17:08:16 +01:00
b48611b80f Merge branch 'develop' into feature/contractor 2018-10-22 18:27:18 +01:00
7d84dca8e9 Merge branch 'develop' into feature/contractor 2018-10-18 23:46:58 +01:00
f31d6bfec2 Hadrons: contractor cleaning and better error check 2018-10-18 17:50:35 +01:00
a7cfa26901 Hadrons: reverse A2A matrix load for better DiskVector cache reuse 2018-10-18 17:50:16 +01:00
f333f3e575 Hadrons: DiskVector save-on-eviction and faster CRC32 for Eigen matrices 2018-10-18 17:48:25 +01:00
f709329d96 Hadrons: first version of a contractor utility 2018-10-17 20:26:48 +01:00
f05b25dae4 Hadrons: A2AMatrix load 2018-10-17 20:26:26 +01:00
3e1d268fa3 Hadrons: DiskVector optimisation 2018-10-17 20:25:32 +01:00
dac9f8622e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-10-08 10:12:11 +01:00
d9de8fd5c9 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-09-17 12:19:47 +01:00
7e3647246c Merge branch 'master' of https://github.com/paboyle/Grid into develop 2018-09-17 12:19:20 +01:00
adbdc4e65b Half comms not working on GPU yet, so disable. 2018-09-11 05:15:22 +01:00
e4deea4b94 Weird bug appears with Vector<Vector<>>.
"fix" with std::vector<Vector<>>

Lies in the face table code. But think there is some latent problem.
Possibly in my allocator since it is caching, but could simplify or eliminate the caching
option and retest. One to look at later.
2018-09-11 04:36:57 +01:00
94d721a20b Comments on further topology discovery work 2018-09-11 04:20:04 +01:00
7bf82f5b37 Offload the face handling to GPU 2018-09-10 11:28:42 +01:00
f02c7ea534 Peer to peer on GPU's setup 2018-09-10 11:26:20 +01:00
bc503b60e6 Offloadable gather code 2018-09-10 11:21:25 +01:00
704ca162c1 Offloadable compression 2018-09-10 11:20:50 +01:00
b5329d8852 Protect against zero length loops giving a kernel call failure 2018-09-10 11:20:07 +01:00
f27b9347ff Better unquiesce MPI coverage 2018-09-10 11:19:39 +01:00
b4967f0231 Verbose and error trapping cleaner 2018-09-09 14:28:02 +01:00
6d0f1aabb1 Fix the multi-node path 2018-09-09 14:27:37 +01:00
f4bfeb835d Drop back to smaller Ls 2018-09-09 14:25:06 +01:00
394b7b6276 Verbose decrease 2018-09-09 14:24:46 +01:00
da17a015c7 Pack the stencil smaller for 128 bit access 2018-07-23 06:12:45 -04:00
1fd08c21ac make simd width configure time option for GPU 2018-07-23 06:10:55 -04:00
28db0631ff Hack to force 128bit accesses 2018-07-23 06:10:27 -04:00
b35401b86b Fix CUDA_ARCH. Need to simplify. See when new eigen release happens 2018-07-23 06:09:33 -04:00
a0714de8ec Define vector length for GPU 2018-07-23 06:09:05 -04:00
21a1710b43 Verbose vector length 2018-07-23 06:08:39 -04:00
b2b5137d28 Finally starting to get decent performance on Volta 2018-07-13 12:06:18 -04:00
2cc07450f4 Fastest option for the dslash 2018-07-05 09:57:55 -04:00
c0e8bc9da9 Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 07:10:25 -04:00
b1265ae867 Prettify code 2018-07-05 07:08:06 -04:00
32bb85ea4c Standard extractLane is fast 2018-07-05 07:07:30 -04:00
ca0607b6ef Clearer kernel call meaning 2018-07-05 07:06:15 -04:00
19b527e83f Better extract merge for GPU. Let the SIMD header files define the pointer type for
access. GPU redirects through builtin float2, double2 for complex
2018-07-05 07:05:13 -04:00
4730d4692a Fast lane extract, saturates bandwidth on Volta for SU3 benchmarks 2018-07-05 07:03:33 -04:00
1bb456c0c5 Minor GPU vector width change 2018-07-05 07:02:04 -04:00
4b04ae3611 Printing improvement 2018-07-05 06:59:38 -04:00
2f776d51c6 Gpu specific benchmark saturates memory. Can enhance Grid to do this for expressions,
but a bitof (known) work.
2018-07-05 06:58:37 -04:00
3a50afe7e7 GPU dslash updates 2018-06-27 22:32:21 +01:00
f8e880b445 Loop for s and xyzt offlow 2018-06-27 21:49:57 +01:00
3e947527cb Move looping over "s" and "site" into kernels for GPU optimisatoin 2018-06-27 21:29:43 +01:00
31f65beac8 Move site and Ls looping into the kernels 2018-06-27 21:28:48 +01:00
38e2a32ac9 Single SIMD lane operations for CUDA 2018-06-27 21:28:06 +01:00
efa84ca50a Keep Cuda 9.1 happy 2018-06-27 21:27:32 +01:00
5e96d6d04c Keep CUDA happy 2018-06-27 21:27:11 +01:00
df30bdc599 CUDA happy 2018-06-27 21:26:49 +01:00
7f45222924 Diagnostics on memory alloc fail 2018-06-27 21:26:20 +01:00
dd891f5e3b Use NVCC to suppress device Eigen 2018-06-27 21:25:17 +01:00
2881b3e8e5 WilsonMG: Remove unnecessary static assertions 2018-06-26 14:42:30 +02:00
cc5d025ea4 WilsonMG: Adapt staggered GMRES/MR tests to "new" constructor 2018-06-18 16:20:20 +02:00
6c97a6a071 Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
73bb2d5128 Ugly hack to speed up compile on GPU; we don't use the hand kernels on GPU anyway so why compile 2018-06-13 20:35:28 +01:00
b710fec6ea Gpu code first version of specialised kernel 2018-06-13 20:34:39 +01:00
b2a8cd60f5 Doubled gauge field is useful 2018-06-13 20:27:47 +01:00
867ee364ab Explicit instantiation hooks 2018-06-13 20:27:12 +01:00
25becc9324 GPU tweaks for benchmarking; really necessary? 2018-06-13 20:26:07 +01:00
94d1ae4c82 Some prep work for GPU shared memory. Need to be careful, as will try GPU direct
RDMA and inter-GPU memory sharing on SUmmit later
2018-06-13 20:24:06 +01:00
2075b177ef CUDA_ARCH more carefule treatment 2018-06-13 20:22:34 +01:00
847c761ccc Move sfw IEEE fp16 into central location 2018-06-13 20:22:01 +01:00
8287ed8383 New GPU vector targets 2018-06-13 20:21:35 +01:00
e6be7416f4 Use managed memory 2018-06-13 20:14:00 +01:00
26863b6d95 User Managed memory 2018-06-13 20:13:42 +01:00
ebd730bd54 Adding 2D loops 2018-06-13 20:13:01 +01:00
066be31a3b Optional GPU target SIMD types; work in progress and trying experiments 2018-06-13 20:07:55 +01:00
7a4c142955 Add GPU specific simd targets 2018-06-13 19:55:30 +01:00
ddcb53bce2 Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-06-13 09:50:37 +02:00
d1c80e1d46 WilsonMG: Correct years in copyright line 2018-06-13 09:44:09 +02:00
c73cc7d354 WilsonMG: Add tests with MG preconditioner running single precision, outer solver running in double 2018-06-12 16:10:48 +02:00
49fdc324a0 WilsonMG: Make MG correctness checks abort on failing tests 2018-06-12 16:10:48 +02:00
f32714a2d1 WilsonMG: Make running MG correctness checks optional via commandline 2018-06-12 16:10:48 +02:00
73a955be20 WilsonMG: Move tests for Wilson & WilsonClover into separate files 2018-06-12 16:10:48 +02:00
66b7a0f871 WilsonMG: Move multigrid class to separate file 2018-06-12 16:10:48 +02:00
2ab9d4bc56 WilsonMG: Fix random behavior in GMRES
From time to time I saw random since the basis vectors were not initialized
properly.
2018-06-12 15:01:31 +02:00
4f41cd114d WilsonMG: Add a mixed precision version of FGMRES
This version does everything in double prec but accepts a preconditioner working
in single precision.
2018-06-12 15:01:31 +02:00
11c4f5e32c WilsonMG: Provide command line switch for reading in input xml + move default params to constructor of MultiGridParams 2018-06-12 15:01:31 +02:00
e9b9550298 WilsonMG: Fix incompatibility with single prec MG in construction of simd layout on coarser grids 2018-06-12 15:01:31 +02:00
7564fedf68 WilsonMG: Set subspace to zero to avoid random behavior 2018-06-12 15:01:31 +02:00
251b904a28 Merge branch 'release/ISC-freeze-2' 2018-06-04 21:09:48 +01:00
5a112feac3 Merge branch 'release/ISC-freeze-1' 2018-06-04 18:49:40 +01:00
6c27c72585 WilsonMG: Provide more sensible default values for MG parameters 2018-05-16 17:26:09 +02:00
9c003d2d72 WilsonMG: Base wilson mg preconditioner entirely on existing infrastructure 2018-05-16 17:26:09 +02:00
4b8710970c WilsonMG: Switch to Galerkin coarsening in CoarsenedMatrix 2018-05-16 17:26:09 +02:00
68d686ec38 WilsonMG: Add functionality for applying G5 on coarse grids 2018-05-16 16:17:14 +02:00
c48b69ca81 WilsonMG: Implement Mdir & Mdiag in CoarsenedMatrix 2018-05-16 16:08:05 +02:00
df8c208f5c WilsonMG: Revert CoarsenedMatrix.h and Lattice_transfer.h back to state of develop branch 2018-05-16 16:02:54 +02:00
61812ab7f1 Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-05-15 14:57:18 +02:00
eb7d34a4cc GPU version 2018-05-14 19:41:47 -04:00
aab27a655a Start of GPU kernels 2018-05-14 19:41:17 -04:00
93280bae85 Gpu option 2018-05-14 19:40:58 -04:00
c5f93abcd7 GPU clean up 2018-05-14 19:40:33 -04:00
d5deef782d Useful debug comments 2018-05-14 19:39:52 -04:00
5f50473c0d Clean up 2018-05-14 19:39:11 -04:00
13f50406e3 Suppress print statement 2018-05-12 18:00:00 -04:00
09cd46d337 Lane by Lane operation 2018-05-12 17:59:35 -04:00
d3f51065c2 Give command line control of blocks/threads split 2018-05-12 17:58:56 -04:00
925ac4173d Thread count control for warp scheduler thingy doodaa thing 2018-05-12 17:58:22 -04:00
eb921041d0 Perf count control 2018-05-12 17:57:32 -04:00
87c5c0271b Ficxing eigen 2018-04-16 19:08:07 -04:00
a3f5a13591 Better Eigen handling 2018-04-16 18:02:55 -04:00
9fe28f00eb Eigen sim link off head revision 2018-04-16 17:54:46 -04:00
a8a0bb85cc Control scalar execution or vector under generic. Disable Eigen vectorisation on powerpc / SUmmit 2018-04-12 12:32:57 -04:00
6411caad67 work distribution 2018-04-12 11:41:41 -04:00
7533035a99 Control Eigen vectorisatoin 2018-04-12 11:40:56 -04:00
73ced656eb Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-04-03 17:51:11 +02:00
f69008edf1 WilsonMG: Add functionality to report timings to MG preconditioner 2018-04-03 17:26:49 +02:00
57a49ed22f WilsonMG: Read in MG parameters from xml in test 2018-04-03 16:03:11 +02:00
ff6413a764 WilsonMG: Make number of levels chooseable at runtime
I don't like this solution though :(
2018-04-03 15:57:33 +02:00
2530bfed01 WilsonMG: Move params instance from global scope to test main function 2018-04-03 14:50:48 +02:00
74f79c5ac7 Revert "Add function to return full type as std::string"
This reverts commit 1cb745c8dc.
2018-03-29 12:03:50 +02:00
58c30c0cb1 WilsonMG: Add conformability checks in MG preconditioner 2018-03-28 13:24:39 +02:00
917a92118a WilsonMG: Move operator test to MG testing routine 2018-03-28 12:19:25 +02:00
04f9cf088d WilsonMG: Add more parameters to MultiGridParams struct 2018-03-27 17:13:11 +02:00
99107038f9 WilsonMG: Rationalize the level counting strategy 2018-03-27 17:06:33 +02:00
b78456bdf4 WilsonMG: Get rid of explicit include of GCR header 2018-03-26 15:41:53 +02:00
08543b6b11 WilsonMG: Provide a switch between V- and K-cycle 2018-03-26 15:37:17 +02:00
63ba33371f WilsonMG: Some minor refactoring 2018-03-26 15:34:53 +02:00
683a7d2ddd WilsonMG: Move comment to make clang-format happy 2018-03-26 14:59:40 +02:00
b15db11c60 Kernels -> pure static object to enable device execution 2018-03-24 19:35:20 -04:00
f6077f9d48 Kernels -> not instantiaed otherwise object ref on GPU 2018-03-24 19:33:44 -04:00
572954ef12 Kernels not an instantiated object, just static 2018-03-24 19:33:13 -04:00
cedeaae7db Lebesge -> StencilView if necessary 2018-03-24 19:32:41 -04:00
e6cf0b1e17 View typedefs go to OperatorImpl 2018-03-24 19:32:11 -04:00
5412628ea6 begin end lamda 2018-03-24 19:31:45 -04:00
1f70cedbab Have to make all kernel called routines static since object reference will be a host pointer on GPU 2018-03-24 19:29:26 -04:00
b50f37cfb4 Remove overlap comms flag 2018-03-24 19:28:53 -04:00
cb0d2a1b03 threaded rng init; I thought this was on 2018-03-24 19:28:17 -04:00
6fe9b28a82 Cosmetic 2018-03-24 19:27:14 -04:00
b002587d7c Simplify 2018-03-24 19:26:44 -04:00
6c08385782 Simplify 2018-03-24 19:26:19 -04:00
afdcbf79d1 Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-03-23 21:13:50 +01:00
3c3ec4e267 WilsonMG: Move tests for Wilson & WilsonClover into the same file 2018-03-23 21:12:27 +01:00
bbe1d5b49e WilsonMG: Temporarily use GMRES in construction of basis vectors
This can go back to CG once Mdag in CoarsenedMatrix works.
2018-03-23 20:02:27 +01:00
0f6009a29f WilsonMG: Huge refactor into something that could be considered an algorithm 2018-03-23 19:55:43 +01:00
1cfed3de7c WilsonMG: Add new logger for MG 2018-03-23 19:55:16 +01:00
4e1272fabf Kernels need to be static to work on GPU. No reference to host resident data 2018-03-22 18:44:53 -04:00
607dc2d3c6 Remove lebesgue order 2018-03-22 18:23:09 -04:00
23c880b009 Remove lebesgue order; stick in stencil if need 2018-03-22 18:13:41 -04:00
334bb6792f Lebesgue order removed. Stick in the stencil view 2018-03-22 18:12:12 -04:00
a3690071b4 Warm up GPu 2018-03-22 18:05:20 -04:00
299d119013 GPU work allocation improved 2018-03-22 18:04:24 -04:00
55be842d23 Dont force l1p.h so early 2018-03-22 18:01:43 -04:00
edbc0d49d7 WilsonMG: Get rid of explicit GridTypeMappers in CoarsenedMatrix 2018-03-22 16:38:24 +01:00
9875c446c6 Clean up pragmas 2018-03-20 07:19:17 -04:00
9c25eb35ca Eigen develop branch for now 2018-03-20 07:18:56 -04:00
5ac96dbdc6 Warm behaviour in SU3 benchmark 2018-03-20 07:18:31 -04:00
5cc9aca85d Use 64bit index for looping 2018-03-20 06:34:52 -04:00
ac29ebcb95 Clean up debug prints 2018-03-20 06:33:59 -04:00
a5cfb89304 Update eigen process direct from develop on github. Dangerous, but needed from GPU 2018-03-19 07:20:48 -04:00
f04a7251cc Gpu welcome message and device info 2018-03-19 07:12:12 -04:00
d4ce7d9905 GPU friendly Stencil needs a view 2018-03-19 07:11:21 -04:00
8a1d303ab9 GPU friendly stencil improvements 2018-03-19 07:11:03 -04:00
bf0a4de919 GPU friendly params object 2018-03-19 07:10:12 -04:00
6fe5885fe4 Warning suppress 2018-03-19 07:09:49 -04:00
17ac309e84 Fix the compile 2018-03-19 07:08:59 -04:00
7467a1c027 Latest eigen needed for GPU 2018-03-19 07:08:10 -04:00
fdfb8a26a8 Disable eigen vectorisation on GPU because of Summit compile issues 2018-03-19 07:07:30 -04:00
2df4e422ad Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2018-03-18 14:45:41 +00:00
3a3e3cac40 Pull the trigger on offload 2018-03-18 14:45:29 +00:00
b1c02ec310 MallocManaged in GPU 2018-03-18 14:44:46 +00:00
38eadee2c9 Prettier code 2018-03-18 14:44:22 +00:00
42c70437be Views 2018-03-18 14:43:47 +00:00
65274b4d7f Tidy up 2018-03-18 14:43:16 +00:00
ee5cf6c8c5 WilsonMG: Some minor changes to GMRES implementations 2018-03-16 13:10:45 +01:00
7e8be32755 Typo fix 2018-03-13 19:22:31 -04:00
ff761ea4e6 Bound check improvement 2018-03-09 20:00:46 +00:00
a31d3e60d8 Better bounds check 2018-03-09 18:10:21 +00:00
a66cecc509 WilsonMG: Fix invalid call to MR ctor 2018-03-09 17:34:29 +01:00
0f6cdf3d4b WilsonMG: Implement missing parts of CoarsenedMatrix 2018-03-09 16:56:16 +01:00
1e63b73a14 WilsonMG: Some cleanup/formatting 2018-03-09 16:50:19 +01:00
4d60b92b7f Update oSites 2018-03-08 21:00:25 +00:00
c159c70c84 View introduced 2018-03-08 14:58:04 +00:00
28b5572755 Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2018-03-08 13:01:42 +00:00
5fac7080bc Adding -fno-strict-aliasing by default 2018-03-08 13:01:24 +00:00
4548523ecc This modification eliminates what looks like a compiler bug
on Intel 2017.
2018-03-08 04:41:16 -08:00
4154fc6f44 Revert a change 2018-03-07 16:54:11 +00:00
4e3458516a Reverting after fixing issue with extract merge 2018-03-07 16:50:13 +00:00
90a2efb9b3 Hit an annoying strict alias optimisation in GCC 4.9 through 6.3
Chris K was correct. It appears that an additional memcpy (UGHHH) is enough
to suppress the compiler
2018-03-07 07:27:26 -08:00
40699221e2 Dont alias lhs and rhs in a where statement 2018-03-06 04:14:13 -08:00
3cb1b545d0 Don't alias the variables with a where statement. 2018-03-06 04:13:26 -08:00
e199ba7e88 Fix the Charge conjugate BC's 2018-03-05 13:59:02 +00:00
4d53703c67 Scalar type differeing allowed, eg. precisoin change 2018-03-05 11:39:52 +00:00
d506c59efa Warnings disabled 2018-03-05 11:39:20 +00:00
44188a5c6f AVX512 fix 2018-03-05 00:32:24 +00:00
2018077770 Make NVCC happy with the compile. This is warning free on 9.1 on my laptop (both make and make tests). 2018-03-05 00:28:24 +00:00
984e06e2b5 Introduce view objects that can safely be copied to GPU for access 2018-03-04 16:40:11 +00:00
aead94e9a7 View introduced 2018-03-04 16:39:29 +00:00
3277bda130 View introduction to prepare for accelerator offload.
Probably same problem exists for stencil object
2018-03-04 16:38:08 +00:00
442b0b406c View related changes 2018-03-04 16:34:14 +00:00
8824a54269 View related changes 2018-03-04 16:33:33 +00:00
c03423250f Indexable changes 2018-03-04 16:31:35 +00:00
317fd0da44 Views introduced. Need to accelerator offload these routines. 2018-03-04 16:30:45 +00:00
783795a44a Views introduced 2018-03-04 16:12:49 +00:00
0e6197fbed Introduce accelerator friendly expression template rewrite.
Must obtain and access lattice indexing through a view object that is safe
to copy construct in copy to GPU (without copying the lattice).
2018-03-04 16:03:19 +00:00
dad7862f91 Go through a view object that can be copied to GPU 2018-03-04 16:02:02 +00:00
c89a883448 where was deprecated and integrated to ET engine a long time ago. Remove dead old original code 2018-03-04 15:58:02 +00:00
c204288fbc Remove a couple of print statements 2018-03-04 15:57:15 +00:00
ad739f042a Introduce views for passing lattice indexing to accelerators. 2018-03-04 15:56:14 +00:00
db988301d0 Introduce view objects for indexing lattices. Used to pass the view to acccelerators 2018-03-04 15:55:16 +00:00
9b1f29c4c2 Support a view for passing to accelerator 2018-03-04 15:54:35 +00:00
e5ea04ee0c Need to support precision change, and real replication in multiple simd lanes 2018-03-04 15:53:04 +00:00
c92a3c6068 Need to support any vector type template and run on accelerator 2018-03-04 15:52:14 +00:00
03f8da8fbc enable-debug option for debug flags in compile 2018-03-04 15:51:47 +00:00
78a9e31ff0 options more obvious 2018-02-24 22:26:32 +00:00
c1fc947bb8 Coordinate handling GPU friendly + some GPU merge/extract improvements 2018-02-24 22:26:10 +00:00
ff7b19a71b Coordinate handling GPU ready avoid malloc 2018-02-24 22:25:39 +00:00
1c16ffa1c1 Coordinate GPU ready. No malloc 2018-02-24 22:25:09 +00:00
4962f59477 Eliminate both GPU issue and threading bottle neck by avoiding malloc in coordinate handling 2018-02-24 22:24:37 +00:00
e158b60bce GPU friendly coords 2018-02-24 22:23:47 +00:00
34820bec27 Coordinate handling GPU ready. No malloc 2018-02-24 22:23:18 +00:00
eed9aa9f0c Extract merge gpu ready 2018-02-24 22:23:01 +00:00
8792ff6439 Coordinate handling gpu ready 2018-02-24 22:22:43 +00:00
078901278c Coordinate handling gpu friendly 2018-02-24 22:22:02 +00:00
bf5fb89aff Coordinate handling GPU friendly 2018-02-24 22:21:36 +00:00
7574c18cef Massive clean up extract merge.
Simpler and GPU friendly
2018-02-24 22:21:08 +00:00
36ea5f6b77 gpu friendly coordinates ; no std::vector on GPU 2018-02-24 22:20:14 +00:00
285deab432 Coordinate handling GPU friendly. Avoid std::vector 2018-02-24 22:19:28 +00:00
bb7d87d0a0 Coordinate handling gpu friendly 2018-02-24 22:18:33 +00:00
6ab60c5b70 Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-02-08 23:59:07 +01:00
8c692b7ffd WilsonMG: Comment assertion on hermiticity of coarse operator for now
TODO: Think of a way to not break dwf_hdcr by doing that. It's only an assertion
but it still interferes with it.
2018-02-08 23:55:05 +01:00
2976132bdd Add first version of multigrid for wilson clover analogous to wilson one
Just like the wilson one, this algorithm

• is currently only a 2-level method since I don't have correct implementations
  for Mdir and Mdiag in CoarsenedMatrix yet (needed for further coarsening)
• needs levelization and refactoring into a proper algorithm
2018-02-08 23:52:10 +01:00
48177f2f2d Add tests for all MR|GMRES solvers with wilson clover action 2018-02-08 23:52:09 +01:00
c4ce70a821 WilsonMG: Major cleanup 2018-02-08 23:52:08 +01:00
a3e009ba54 Add tests for CAGMRES solvers with staggered action 2018-02-08 17:46:28 +01:00
eb7cf239d9 Print warning messages in CAGMRES solvers
Currently, the implementation of these algorithms doesn't differ from their non
communication-avoiding versions.
2018-02-08 17:43:47 +01:00
13ae371ef8 Make solver parameters match in all MR|GMRES solver tests 2018-02-08 17:33:10 +01:00
9f79a87102 Fix bugs in Flexible GMRES solvers
Somehow I got the left and right-preconditioned versions of GMRES mixed up. As
of now this is right-preconditioned version, which is what we want.
2018-02-08 16:00:31 +01:00
4ded1ceeb0 Make GMRES solvers perform no more than MaxIterations steps
I noticed that it was possible to overrun this number.
2018-02-08 15:29:44 +01:00
8bc12e0ce1 Remove superfluous comments in MR solver 2018-02-07 18:09:09 +01:00
cc2f00f827 Remove test for MR solver with dwf action as it doesn't converge 2018-02-07 18:09:08 +01:00
cd61e2e6d6 Increase max iterations in test of MR solver with staggered action 2018-02-07 18:09:07 +01:00
323ed1a588 Add an overrelaxation parameter to the MR solver 2018-02-07 18:09:06 +01:00
68c66d2e4b Remove empty line in output of *Residual* solvers 2018-02-07 18:08:56 +01:00
1671adfd49 WilsonMG: Add some tests for linear operators 2018-02-07 17:15:22 +01:00
b9b5bdfc3a Proper offload (accelerator access) will require a mutable copy lambda. 2018-02-02 11:38:19 +00:00
51eb2c5dfc Make referencign the stencil and all info required to evaluate the kernel
accelerator marked up
2018-02-02 11:37:13 +00:00
ede0dff794 Mark up as an accelerator function 2018-02-02 11:36:44 +00:00
aa6de818e2 Copy data needed by Kernels out of the grid object to avoid host reference 2018-02-02 11:36:11 +00:00
dcf6517a93 Accelerator offload and copy Opt into the kernel for GPU host var safety 2018-02-02 11:35:35 +00:00
a308dff410 accelerator loop, copy Opt into the GPU 2018-02-02 11:34:37 +00:00
14ba20898a Accelerator loop the key kernel call 2018-02-02 11:30:07 +00:00
a53d3ee19a Add Opt to the lambda capture to get it into the GPU 2018-02-02 11:28:39 +00:00
5df435319d Use constexpr 2018-02-02 11:27:56 +00:00
0da2d3e222 accelerator off load some more stuff 2018-02-02 11:27:35 +00:00
9c9dfbfa78 Force accelerator 2018-02-02 11:25:09 +00:00
e4df025d01 Accelerator related 2018-02-01 23:20:05 +00:00
cfeda9d536 constexpr on const ints 2018-02-01 22:59:12 +00:00
4450b1993a Offload 2018-02-01 22:45:47 +00:00
d03ce5c2a4 Provide a way to get around std::vector for a known type on device.
Use template specialisation to access a private member in the Clang++ STL implementation
2018-02-01 22:44:25 +00:00
7d6522c1ef Accelerator inline 2018-02-01 22:43:56 +00:00
b96832a922 Accelerator inline 2018-02-01 22:43:26 +00:00
5d7af47b05 accelerator_inline 2018-02-01 22:42:54 +00:00
053ef25c90 constexpr makes GPU happy 2018-02-01 22:42:29 +00:00
8ae77d3706 Small simplification of FermionOperatorImpl towards GPU but not there yet 2018-02-01 22:41:54 +00:00
871649238c WilsonMG: Stricter naming for linear operators 2018-02-01 14:43:08 +01:00
7c86d2085b WilsonMG: Some minor cleanup 2018-02-01 12:24:16 +01:00
9292be0b69 WilsonMG: Add check for Mdiag + Σ Mdir == M
Need to test my implementations of CoarsenedMatrix::Mdiag &
CoarsenedMatrix::Mdir.
2018-01-31 14:03:30 +01:00
10141f90c9 WilsonMG: Rename test file 2018-01-30 10:25:09 +01:00
a414430817 Merge remote-tracking branch 'upstream/develop' into feature/ddalphaamg 2018-01-29 18:32:31 +01:00
f20728baa9 WilsonMG: Some further steps towards a three level method
Currently this is very "manual" as we are still testing stuff. Will refactor
and make it an algorithm once everything works.

What currently does work:

  - All tests in MultiGridPreconditioner::runChecks for the first coarse grid
  - The tests for the intergrid operators going from the first to the second
    coarse grid
    - (1 - P R) v   == 0
    - (1 - R P) v_c == 0
  - A full solve with VPGCR and a two-level MG preconditioner

What hinders the rest of the tests from passing with a three-level method is the
absence of implementations of CoarsenedMatrix::Mdir and CoarsenedMatrix::Mdiag.
2018-01-29 18:29:49 +01:00
d2e68c4355 WilsonMG: Perform some minor cleanup 2018-01-29 18:07:10 +01:00
1cb745c8dc Add function to return full type as std::string
Also works for nested templates. I find it useful for debugging.
Possible usage:

std::cout << "getTypename<AType>() = " << getTypename<Atype>() << std::endl;
std::cout << "getTypename<decltype(AnInstance)>() = " << getTypename<decltype(AnInstance)>() << std::endl;
2018-01-29 17:39:19 +01:00
faf4278019 Use 2 passes of GS in coarse operator construction 2018-01-29 17:21:42 +01:00
194e4b94bb Make MG checking function work level-wise 2018-01-29 17:18:20 +01:00
bfc1411c1f Use more iterations in subspace creation 2018-01-29 17:11:29 +01:00
161637e573 Turn on orthogonality checking temporarily 2018-01-29 17:10:05 +01:00
79b50feacf fixme updates 2018-01-29 16:00:40 +00:00
c67c1544cd abs no compile on travis fix attempt 2018-01-28 10:26:04 +00:00
e657f9a344 OMP collapse changes to make NVCC happy 2018-01-28 01:21:53 +00:00
b6ebf35af5 Intel compiler doesn't like Nvidia error disable pragmas 2018-01-28 01:03:10 +00:00
604c05f4b8 parallel_for elimination -> thread_loop 2018-01-28 01:01:36 +00:00
70e276e1ab parallel_for elimination -> thread_loop 2018-01-28 01:01:14 +00:00
9472b02771 Parallel_for elimination -> thread_loop. 2018-01-28 01:00:55 +00:00
9597ab94eb Zero changes, swap on lattice type. 2018-01-27 23:51:40 +00:00
ce4da83bc2 Zero changes, literally 2018-01-27 23:51:10 +00:00
d557f3ef77 Zero changes (literally) and also a warning elimination 2018-01-27 23:50:43 +00:00
f574c20118 Zero changes, __VA_ARGS__ and swap 2018-01-27 23:50:17 +00:00
f102897385 VA_ARGS to make comma safe automatic 2018-01-27 23:49:47 +00:00
d6fce3e498 Zero changes, literally 2018-01-27 23:48:01 +00:00
2d0bcc2606 Zero changes, acceleartor on kernels and some thread loop changes 2018-01-27 23:47:38 +00:00
45df59720e Zero changes and VA_ARGS changes 2018-01-27 23:46:58 +00:00
44ef5bc207 Zero changes (literally speaking). 2018-01-27 23:46:28 +00:00
98af36217a Zero changes. (I mean literally) 2018-01-27 23:46:02 +00:00
be7b37b9c9 Mistake on openmp 2018-01-27 00:05:11 +00:00
c4f82e072b _grid becomes private ; use Grid()§ 2018-01-27 00:04:12 +00:00
3f9654e397 Hiding internals 2018-01-26 23:09:03 +00:00
912b50f6fa Hiding lattice internals 2018-01-26 23:08:45 +00:00
2a4a0e43c1 Hide internals 2018-01-26 23:08:27 +00:00
32523a229c Hide internals 2018-01-26 23:08:02 +00:00
1ebd56c3fb Hide internal data 2018-01-26 23:07:34 +00:00
8dccffdfd5 Hide internal data 2018-01-26 23:06:51 +00:00
5642ea270f Hide internal data 2018-01-26 23:06:28 +00:00
43cea62855 Hide internal data 2018-01-26 23:06:03 +00:00
2b4067bb71 Hide internal data 2018-01-26 23:05:32 +00:00
85771e97e9 Hide internal data 2018-01-26 23:04:46 +00:00
8b371ffa94 Hide internal data 2018-01-26 23:03:54 +00:00
bf659dfd92 Hide the ._odata 2018-01-26 22:27:47 +00:00
76a4dd36d9 Fix no compile of test serialisation 2018-01-26 00:13:21 +00:00
f4010023ca Warning fixes 2018-01-25 23:46:47 +00:00
24a4589def Changes to interface a little 2018-01-25 23:37:34 +00:00
c904822e74 Warning removal 2018-01-25 23:37:15 +00:00
40ee1e1957 Zero() 2018-01-25 23:36:58 +00:00
461df78a3f Better to use Zero(), and not zero static data 2018-01-25 23:36:22 +00:00
db9c9475d4 const 2018-01-25 23:36:06 +00:00
214f7a6f13 Drop std::vector container for the lattice data 2018-01-25 23:35:04 +00:00
c844cfcda8 Remove commAllocator; make more simple; option to switch off the pointer caceh 2018-01-25 23:33:57 +00:00
a3e3034e6f Host compile 2018-01-25 23:33:00 +00:00
e7cba358c2 Temporary update to reflect the new dropping of std::vector in Lattice
Will update again to hide the internals in an interface
2018-01-25 23:31:41 +00:00
99329197ee Rename header to .h 2018-01-24 14:10:09 +00:00
421401af55 Remove IMCI as really don't support 2018-01-24 13:53:21 +00:00
0626c1e39e Accelerator flaggina dn thrust complex for NVCC 2018-01-24 13:50:41 +00:00
725f03e2e2 Accelerator markup and thrust complex on nvcc 2018-01-24 13:50:10 +00:00
65f77112e0 Thread loops done properly 2018-01-24 13:49:39 +00:00
408b868475 Generic for GPU needs accelerator markup of functions 2018-01-24 13:49:12 +00:00
1c797deb04 Accelerator tweaks 2018-01-24 13:43:43 +00:00
b9d5a42b57 Should be able to eliminate the COMMA_SAFE with VA_ARGS trick ; revisit this file 2018-01-24 13:42:06 +00:00
e737591918 Accelerator loops 2018-01-24 13:41:12 +00:00
ba5ea5830b Acceleartor loops 2018-01-24 13:40:56 +00:00
43f244badf Thread loops for now; figure out what can be GPU accelerated later here 2018-01-24 13:40:30 +00:00
e9c8ba5ef7 Accelerator loosp 2018-01-24 13:39:54 +00:00
d70709a8e8 Thread construct changes 2018-01-24 13:39:06 +00:00
733f8ff0b2 Still using parallel_for -- don't know how to implement reduction on GPU yet. Look at some sample code is best. 2018-01-24 13:38:13 +00:00
0bfa5bb213 Accelerator loosp 2018-01-24 13:37:26 +00:00
1f26a234f9 CPU loops explicit for peek poke 2018-01-24 13:36:31 +00:00
13f0116425 Accelerator loops 2018-01-24 13:35:55 +00:00
25f589b064 Accelerator loops 2018-01-24 13:35:36 +00:00
210c50a278 Accelerator prep work 2018-01-24 13:35:13 +00:00
549a143e78 Accelerator related 2018-01-24 13:34:46 +00:00
277301486d Simple warning elimination 2018-01-24 13:34:15 +00:00
c851b39a49 Nicer way of including aggregate 2018-01-24 13:33:34 +00:00
15cc12eb6c Delete the old non ET file 2018-01-24 13:33:07 +00:00
ae4f1f8c12 New file, split out two from Lattice_reduction 2018-01-24 13:32:43 +00:00
5609624b44 Threading constructs replaced 2018-01-24 13:32:24 +00:00
b5a947dd79 Change to make NVCC happy 2018-01-24 13:32:02 +00:00
ee16f62322 stray semicolon elimination. NVCC is picky, but eventually picked up these diags
with a pragma to suppress
2018-01-24 13:31:17 +00:00
3318de27d6 Thread macro changes 2018-01-24 13:30:23 +00:00
ac56965306 GPU changes and threading macros replaced 2018-01-24 13:28:30 +00:00
8e99264f40 Accelerator mark up of entire tensore space for offload 2018-01-24 13:27:30 +00:00
69327db9a9 Improviements for NVCC. Eigen is not compat with CUDA 9 and must hack to disable device
compilation
2018-01-24 13:25:07 +00:00
7331ee2d80 Warnings control to overpower the NVCC compiler 2018-01-24 13:24:36 +00:00
918c105c57 NVCC warning elimination 2018-01-24 13:23:59 +00:00
be1511d469 Remove old macros for threading 2018-01-24 13:23:24 +00:00
f1c31df9d2 updated Eigen version. Still didn't fix CUDA 9 no compile.
Worked around by switching off __NVCC__ during the include of Eigen and switching it
back on after. No Eigen code can be offloaded, note as a rsult of this. No harm done.
2018-01-24 13:19:29 +00:00
ff7b587fad Ugly... nvcc needs -x cu to compile .cc as cuda.
Since CXXFLAGS is Also passed to linker, and -x cu breaks link phase must replace
CXX and CXXLD with nvcc -x cu and nvcc -link respectively.
2018-01-24 13:18:19 +00:00
4e1135b214 Updated pugixml to v1.8; still didn't fix no compile under nvcc.
Turns out nvcc was right; must to an explicit template instantiation that was missing
but left gcc, icpc and clang happy for some reason.
Fix this.
2018-01-24 13:17:10 +00:00
acd4955a18 remove rdtsc on __NVCC__ as may be device called 2018-01-24 13:16:18 +00:00
bd08dc4f45 Pragma use for nvcc, warning elimination. 2018-01-24 13:15:43 +00:00
22d137d4e5 Namespace, nvcc warning elimination. 2018-01-24 13:14:43 +00:00
87ee592176 Pragma changes and layout and warning elimination for nvcc 2018-01-24 13:14:09 +00:00
063603b1ea Warning elimination 2018-01-24 13:12:14 +00:00
f292106db6 Split out pragms from threads.h;
More work needed; renam threads directory to "parallelism" or something like that
2018-01-24 13:11:04 +00:00
9d08aebea9 Compile through nvcc ; warning elimination fixes 2018-01-24 13:09:53 +00:00
4e30739093 First compile OK through nvcc on host 2018-01-24 13:08:47 +00:00
04f92ccddf WilsonMG: Provide a fix for the previous commit; compiles and runs successfully now
I don't like the solution with the temporary very much though ...
2018-01-22 14:56:48 +01:00
3b2d805398 WilsonMG: Some first steps towards coarse spin dofs; not compiling yet
A failing conversion from the innermost type (Grid::Simd<...>) to a coarse
scalar (triple iScalar) in blockPromote prohibits this commit from working.
2018-01-22 12:45:51 +01:00
9dc885d297 Fix a bug in Wilson MG
The calculation of the lattice size of a second coarse level was incorrect.
2018-01-18 17:02:04 +01:00
a70c1feecc Remove some unnecessary stuff in Wilson MG 2018-01-18 15:48:28 +01:00
38328100c9 Implement correctness checks for Wilson MG 2018-01-18 15:43:15 +01:00
9732519c41 Apply clang-format to Wilson MG
I can provide the configuration file I used if people want that.
2018-01-18 15:14:37 +01:00
fa4eeb28c4 Save current state in Wilson MG test file 2018-01-17 17:56:34 +01:00
90ea472411 Auto emacs format C++ no namespace indent 2018-01-15 11:44:54 +00:00
56999474e2 Indent 2018-01-15 11:44:45 +00:00
d74c21a386 GLobal edit for QCD namespace removal & NAMESPACE macros 2018-01-15 09:37:58 +00:00
ca6bdd7302 Useful drive to emacs C++ mode 2018-01-15 00:24:41 +00:00
6f20f1d224 Namespace 2018-01-15 00:24:20 +00:00
d0e357ef89 CLeanup and no QCD namespace 2018-01-15 00:23:51 +00:00
21251f2e1b Namespace and formatting changes 2018-01-15 00:21:27 +00:00
fcf1ccf669 Namespace, indent, badly formatted 2018-01-15 00:17:58 +00:00
49cce514f1 Namespace 2018-01-15 00:17:11 +00:00
695af98a1d Namespace, indent, tidy 2018-01-15 00:16:13 +00:00
f8cb46d360 Namspace, indent, badly formatted code fixed 2018-01-15 00:14:47 +00:00
0da64dea90 Namespace, indent 2018-01-15 00:13:32 +00:00
2cceebbf12 Namespace, indent 2018-01-15 00:12:20 +00:00
40232dcefe Namespce 2018-01-15 00:11:19 +00:00
dbd86bb95b CLeanup, namespace, indent 2018-01-15 00:10:11 +00:00
b8fd2c161f Indent, namespace 2018-01-15 00:09:33 +00:00
df9b979583 Indent, namespace 2018-01-15 00:08:40 +00:00
23ef0e3e19 Namespace and indentation 2018-01-15 00:07:46 +00:00
ae9175735a Indentation, Namespace 2018-01-15 00:07:10 +00:00
2d13ea1a22 Namespace and indentation emacs choices 2018-01-15 00:05:55 +00:00
8c675064bd Namespace and indentation 2018-01-15 00:04:43 +00:00
550b905bb8 Namespace nd indentation 2018-01-15 00:03:49 +00:00
edb79dc088 Namespce,and indent 2018-01-15 00:02:33 +00:00
88e635c5d1 Namepscae, format 2018-01-15 00:02:01 +00:00
ecb4a24de8 Namespace 2018-01-15 00:01:25 +00:00
c8c1d36710 Namespace, indent 2018-01-15 00:00:52 +00:00
b4bb428d9b Namespace, indent 2018-01-14 23:59:57 +00:00
e9ef7e3852 Namespace, indent 2018-01-14 23:59:23 +00:00
31cbbfc07e Namespace, indent 2018-01-14 23:58:44 +00:00
4eb0552d1d Namespace, indnet 2018-01-14 23:58:03 +00:00
08f2a4564f Namespace, formatting 2018-01-14 23:56:33 +00:00
7e00f643f8 Namespace indent 2018-01-14 23:55:44 +00:00
c19ccdad7c Namespace, indent 2018-01-14 23:55:07 +00:00
8aed4181e1 Namespace, indent 2018-01-14 23:54:25 +00:00
06ab7f5661 Namespace 2018-01-14 23:53:31 +00:00
645ec8eba0 Namespace 2018-01-14 23:52:26 +00:00
72ffa8a88e Namespace 2018-01-14 23:51:38 +00:00
4c829b410e Namespace 2018-01-14 23:50:20 +00:00
eda4fd9912 Namespace 2018-01-14 23:49:11 +00:00
041d9137c0 Namespace 2018-01-14 23:48:27 +00:00
eeacdfe031 Namespace 2018-01-14 23:47:37 +00:00
e5535f4d72 Namespace, indent 2018-01-14 23:46:51 +00:00
044a292281 Namespace, indnet 2018-01-14 23:46:07 +00:00
fe0467df1e Namespace, indenting 2018-01-14 23:45:19 +00:00
19234fb40e Namespace, format 2018-01-14 23:44:16 +00:00
f445257d28 Namespace, indenting 2018-01-14 23:43:36 +00:00
bdc2a987aa Namespace, indent 2018-01-14 23:42:47 +00:00
72acb0e48f Namespace, indent 2018-01-14 23:41:59 +00:00
b4e9211df7 Namespace, indent 2018-01-14 23:40:38 +00:00
97019d2997 Namespace, format 2018-01-14 23:39:57 +00:00
83c5f05094 Namespace, indent 2018-01-14 23:39:13 +00:00
1619e42d90 Indent and Namespace changes 2018-01-14 23:38:25 +00:00
9f6cebe5ff Namespace and format changes 2018-01-14 23:37:40 +00:00
a84ebe5624 Namespace, format change 2018-01-14 23:36:45 +00:00
c527e39881 Namespace, format indent change 2018-01-14 23:36:07 +00:00
a0f4687887 Namespace, formatting indent changes 2018-01-14 23:35:16 +00:00
3ef7b2389e Format eamcs style after NAMESPCCE change 2018-01-14 23:34:08 +00:00
7dfa3d0b50 Namespace, format 2018-01-14 23:33:16 +00:00
bf629dddce Namespace, format improved 2018-01-14 23:32:19 +00:00
7747b95430 Namespace, formatting emacs style 2018-01-14 23:31:28 +00:00
ccd75c039a Namespace, fmt 2018-01-14 23:30:34 +00:00
493ea80208 Namespace 2018-01-14 23:29:53 +00:00
229baf3aba Namespace, emacs fmt 2018-01-14 23:29:02 +00:00
0ce4ecfc84 Emacs format indent 2018-01-14 23:28:12 +00:00
ddfaae8ea6 Namespace 2018-01-14 23:27:49 +00:00
70c5b781e5 Namespace, clean up 2018-01-14 23:26:41 +00:00
901e359d28 Namespace changes; need to simplify the EOFA as too many cases and duplicated from Mobius 2018-01-14 23:25:51 +00:00
e857d4d4c8 Namespace, indent 2018-01-14 23:24:51 +00:00
e5b77c7fd8 Namespace, indent 2018-01-14 23:24:06 +00:00
3b5d629048 Namespace, format 2018-01-14 23:23:26 +00:00
08772d5e0c Namespace, indent 2018-01-14 23:22:42 +00:00
017dcd69a6 Namespace, indent 2018-01-14 23:21:40 +00:00
8178a17b88 Namespace, indent 2018-01-14 23:20:55 +00:00
c5c1b53e54 Namespace, indent 2018-01-14 23:20:08 +00:00
440f9e2013 Namespace, indent 2018-01-14 23:19:22 +00:00
c98657d588 Namespace 2018-01-14 23:18:46 +00:00
f450857716 Namespce, indent 2018-01-14 23:17:33 +00:00
9ec238df9e Namespace, indent 2018-01-14 23:16:49 +00:00
3ba8eb1500 Namespace, indent 2018-01-14 23:16:08 +00:00
8da49c5a34 Namespace 2018-01-14 23:15:26 +00:00
e04f61b1fa Namespace 2018-01-14 23:14:46 +00:00
115e13b227 Namespace 2018-01-14 23:13:49 +00:00
75f3062a80 Think this should move to the algorithms directory 2018-01-14 23:12:14 +00:00
b460cd3ef1 Namespace, format 2018-01-14 23:11:24 +00:00
0e6727a33b Namespace, format; possibly some conflict with Azusa beware 2018-01-14 23:10:21 +00:00
4c6745cb4c Namespace 2018-01-14 23:09:44 +00:00
efdd0e572c Namespace 2018-01-14 23:09:10 +00:00
ca60a218ac Namespace 2018-01-14 23:08:35 +00:00
03633d709e Namespace 2018-01-14 23:07:36 +00:00
4de58c4aab Namespace 2018-01-14 23:06:47 +00:00
4f8b1c1940 Namespace 2018-01-14 23:05:23 +00:00
dec39b313d Namespace and format 2018-01-14 23:04:37 +00:00
dc835ad1cb Namespace 2018-01-14 23:03:49 +00:00
71c8c9e4fb Pretty 2018-01-14 23:03:01 +00:00
a935ef7b39 Namespace 2018-01-14 23:01:07 +00:00
a97ad1a51d Namespce 2018-01-14 23:01:01 +00:00
5ab9129db3 Namespace 2018-01-14 22:58:42 +00:00
634943c11f Namepsace 2018-01-14 22:57:59 +00:00
e598e65f69 Namespace 2018-01-14 22:57:10 +00:00
291407dc7f Namespace 2018-01-14 22:54:42 +00:00
641a28aa1d Namespace 2018-01-14 22:53:50 +00:00
75207fa010 FOrmat 2018-01-14 22:53:13 +00:00
c2b0e0269a Namespace 2018-01-14 22:52:22 +00:00
7828887604 Namespace, indent 2018-01-14 22:51:18 +00:00
e6efc93a7c Namespace 2018-01-14 22:50:35 +00:00
ff7e773d5e Namesapce 2018-01-14 22:49:48 +00:00
a0380fad72 Namespace 2018-01-14 22:48:57 +00:00
61e9a33777 Namesapce 2018-01-14 22:48:08 +00:00
3e139b52d3 Namespace 2018-01-14 22:47:24 +00:00
fd6031b005 Namespace 2018-01-14 22:46:17 +00:00
fe44fc50d9 Namespace 2018-01-14 22:45:29 +00:00
2dd88cf3f8 Namespace 2018-01-14 22:44:41 +00:00
6b7e82f1a9 Namespace, indentation 2018-01-14 22:44:06 +00:00
be612b3931 Namespace, indentation 2018-01-14 22:43:27 +00:00
f5e74033f9 Namespace 2018-01-14 22:42:31 +00:00
8d52e0a349 Namespace 2018-01-14 22:41:23 +00:00
a60f6d353e Namespace 2018-01-14 22:40:29 +00:00
5d3b574325 Missing banner; should recreate globally 2018-01-14 22:39:24 +00:00
6ee5ea6b32 Namespace QCD gone 2018-01-14 22:38:22 +00:00
cc349c6512 Namespace 2018-01-14 22:36:59 +00:00
fde2e07bf4 Namespace 2018-01-14 22:36:15 +00:00
2f38fe8d45 Namespace 2018-01-14 22:35:24 +00:00
813af84ae8 Format emacs C++ mode 2018-01-14 22:34:12 +00:00
cfe6c6838f Namespace 2018-01-14 22:33:18 +00:00
12a7216dfe Namespace 2018-01-14 22:32:29 +00:00
71ebd61327 Namespace 2018-01-14 22:31:39 +00:00
2c2da60cc2 Namespace 2018-01-14 22:30:54 +00:00
7631ed9c56 Namespace 2018-01-14 22:30:09 +00:00
65669b116e Namespace 2018-01-14 22:29:18 +00:00
ae2a6cfc6e Namespace 2018-01-14 22:27:32 +00:00
c36223055e Namespace 2018-01-14 22:26:55 +00:00
e42de105c5 Namespace 2018-01-14 22:26:11 +00:00
b08dae0809 Namespace 2018-01-14 22:25:29 +00:00
3bf8fddbb5 Namespace 2018-01-14 22:24:47 +00:00
d29fa23ebc Namespace 2018-01-14 22:23:49 +00:00
c978c88521 Namespace 2018-01-14 22:22:27 +00:00
93f09818da Namespace 2018-01-14 22:21:40 +00:00
54a8ea93ec Namespace QCD gone 2018-01-14 22:20:42 +00:00
56e87d6e55 Namespace 2018-01-14 22:19:25 +00:00
df29cc19ab Namespace 2018-01-14 22:18:27 +00:00
e61189db3f Namespace 2018-01-14 22:17:43 +00:00
361ce948c3 Namespace 2018-01-14 22:16:33 +00:00
049b4a4631 Namespace 2018-01-14 22:15:55 +00:00
9f2f294a27 Namespace 2018-01-14 22:14:58 +00:00
81dcd0e6ea Namespace 2018-01-14 22:13:46 +00:00
34a788331f Namespace 2018-01-14 22:13:02 +00:00
e2c39945b3 Namespace 2018-01-14 22:11:03 +00:00
1591d391b9 Namespace 2018-01-14 22:09:42 +00:00
f4c06ed8c0 Namespace 2018-01-14 22:08:25 +00:00
1f49f781bf Namespace 2018-01-14 22:07:27 +00:00
3a9f746421 Namespace 2018-01-14 22:06:01 +00:00
4491d87766 Namespace 2018-01-14 22:04:21 +00:00
0e080a7abc Namespace 2018-01-14 22:03:14 +00:00
8bf78846ee Namespace 2018-01-14 22:02:09 +00:00
9aa34dc803 Namespace 2018-01-14 22:01:17 +00:00
fdcbe0a0d1 Namespace 2018-01-14 22:00:29 +00:00
6a62a9c6a5 Namespace 2018-01-14 21:59:48 +00:00
b331ecea78 Namespace 2018-01-14 21:58:47 +00:00
66f8a2f082 Namespace 2018-01-14 21:57:46 +00:00
d58b7cf9b9 Namespace changes 2018-01-14 21:56:55 +00:00
0d749becff Namespace 2018-01-14 21:55:47 +00:00
1dbea9aa69 Namespace 2018-01-14 21:54:28 +00:00
c1438cbbe3 Namespace 2018-01-14 21:53:39 +00:00
f4623fd551 Namespace 2018-01-14 21:53:05 +00:00
59ba9ff3bb NAMESPACE & format 2018-01-14 21:52:27 +00:00
1fbab4032b Namespace changes 2018-01-14 21:51:19 +00:00
c037244874 Tensor reformatted with NAMESPACE too 2018-01-13 00:31:02 +00:00
f4272aa6fd Clean up 2018-01-13 00:19:19 +00:00
8cb7a1a887 Format 2018-01-13 00:17:16 +00:00
b45bd8e097 NAMESPACE 2018-01-13 00:16:34 +00:00
5e48b701ec FOrmatting 2018-01-13 00:11:53 +00:00
7f6bffe5ad NAMESPACE 2018-01-13 00:11:30 +00:00
6bf5fb1924 Clean up and format NAMESPACE 2018-01-13 00:08:25 +00:00
086db7bd19 NAMESPACE and reformat 2018-01-13 00:05:33 +00:00
c0a9b38c02 C++ NAMESPACE format emacs happy 2018-01-13 00:03:57 +00:00
6d7bdfb5f5 Emacs happy 2018-01-13 00:02:53 +00:00
be5d70ae6e C++ happy 2018-01-13 00:02:10 +00:00
ab1068044e C++ emacs happy 2018-01-13 00:01:58 +00:00
dda151250f Emacs format 2018-01-12 23:59:58 +00:00
18daf85069 Emacs format 2018-01-12 23:58:23 +00:00
81cc28f6ca Format 2018-01-12 23:57:22 +00:00
c01a1e02fe Namespace, format 2018-01-12 23:55:38 +00:00
7e70f4ed9c Format, NAMESPACE 2018-01-12 23:55:03 +00:00
1056e36f11 Format, NAMESPACE 2018-01-12 23:49:46 +00:00
0b8a88978b Format, NAMESPACE 2018-01-12 23:47:24 +00:00
59b31b6bb8 Format, NAMESPACE 2018-01-12 23:43:44 +00:00
69496482fc Format, NAMESPACE 2018-01-12 23:42:22 +00:00
4be31ad1f6 C++ indentation 2018-01-12 23:39:49 +00:00
176a021ce9 Formatting, NAMESPACE§ 2018-01-12 23:38:15 +00:00
b673174b71 FOrmat, NAMESPACE 2018-01-12 23:29:22 +00:00
e6f7a5a818 Namespace 2018-01-12 23:28:01 +00:00
68b69a2ac0 Namespace management 2018-01-12 23:26:14 +00:00
bd15c38ae8 Formatting emacs compliant 2018-01-12 23:25:02 +00:00
b815f5f764 Formatting 2018-01-12 23:23:21 +00:00
4da437431e Reformat 2018-01-12 23:22:46 +00:00
3c7bf211a9 Reformat 2018-01-12 23:22:18 +00:00
347d5404dd format 2018-01-12 23:21:25 +00:00
5e2cd0d07c Format 2018-01-12 23:18:22 +00:00
62fcee72c5 Format, NAMESPACE 2018-01-12 23:16:37 +00:00
0a6168eef0 Format emacs style 2018-01-12 23:11:22 +00:00
63865e4232 format 2018-01-12 23:10:48 +00:00
c64deedf74 Format 2018-01-12 23:09:35 +00:00
3281559ec3 Format 2018-01-12 23:09:01 +00:00
6a2eca2ec2 NAMESAPCE 2018-01-12 23:00:03 +00:00
d8ff895e74 NAMESPACE and format 2018-01-12 18:27:22 +00:00
00c49d4c17 Format 2018-01-12 18:25:39 +00:00
ec89714cce NAMESPACE 2018-01-12 18:24:16 +00:00
6ab744c720 NAMESPACE and formatting 2018-01-12 18:11:04 +00:00
bbb657da5c NAMESPACE and formatting 2018-01-12 18:10:11 +00:00
fbc2380cb8 NAMESPACE & format 2018-01-12 18:05:36 +00:00
08682c5461 NAMESPACE and format to my liking 2018-01-12 18:03:57 +00:00
13bce2a6bf NAMESPACE 2018-01-12 17:58:53 +00:00
70e689900b NAMESPACE 2018-01-12 17:58:13 +00:00
10f7a17ae4 Make timing in VPGCR more detailed 2018-01-11 13:42:18 +01:00
26f14d7dd7 Adapt output format of non-herm solvers to the one of VPGCR 2018-01-11 13:36:30 +01:00
73434db636 Merge remote-tracking branch 'upstream/develop' into feature/ddalphaamg 2018-01-09 10:43:33 +01:00
c6411f8514 Merge remote-tracking branch 'upstream/develop' into feature/ddalphaamg 2018-01-08 10:37:10 +01:00
6cf635d61c Remove some old code in Wilson MG 2017-12-22 13:20:09 +01:00
39558cce52 Multiply TVs in Wilson MG with G5 instead of G5R5 2017-12-22 13:07:56 +01:00
df152648d6 Fix error in MR code when compiling for single precision 2017-12-06 18:00:58 +01:00
7a0c9e84f8 Fix HDF5 src 2017-11-29 18:03:03 -05:00
caf1a3c85d Also add HDF5 src 2017-11-29 17:58:02 -05:00
21ca730a49 Also import some dependcies 2017-11-29 17:34:40 -05:00
c6cd27e9b2 Also import Eigen.inc 2017-11-29 17:26:20 -05:00
6068411d61 Remove Eigen from gitignore 2017-11-29 17:24:40 -05:00
4e965c168e Implement analogon to test vector analysis in WMG codebase 2017-11-29 15:05:27 +01:00
f260af546e Save current state 2017-11-28 15:03:02 +01:00
649b8c9aca Save current state 2017-11-24 10:46:20 +01:00
0afa22747d Merge remote-tracking branch 'upstream/develop' into feature/new-solver-algorithms 2017-11-24 10:11:42 +01:00
fa43206c79 Remove some empty lines 2017-11-10 13:48:38 +01:00
a367835bf2 Set everything up for the implementation of FCAGMRES
The current implementation is the exact same code as normal FGMRES. This commit
only sets up the "framework" for the implementation of FCAGMRES, i.e., a test
and an include in the algorithms header file.
2017-11-09 17:30:41 +01:00
d7743591ea Fix some minor formatting errors 2017-11-09 17:28:19 +01:00
c6cbe533ea Set everything up for the implementation of CAGMRES
The current implementation is the exact same code as normal GMRES. This commit
only sets up the "framework" for the implementation of CAGMRES, i.e., a test and
an include in the algorithms header file.
2017-11-09 17:14:44 +01:00
8402ab6cf9 Some minor formatting improvements 2017-11-09 12:52:04 +01:00
c63095345e Remove some superfluous comments 2017-11-09 12:47:20 +01:00
a7ae46b61e Remove some comments 2017-11-08 16:58:20 +01:00
cd63052205 Remove everything preconditioner-related in GMRES code 2017-11-08 16:57:40 +01:00
699d537cd6 Add FGMRES test with staggered fermions 2017-11-08 16:56:42 +01:00
9031f0ed95 Fix a filename in a file header 2017-11-08 16:42:26 +01:00
26b3d441bb Check in forgotten FGMRES test with wilson Fermions 2017-11-08 16:39:11 +01:00
99bc4cde56 Fix an implementation error in FGMRES 2017-11-08 16:38:34 +01:00
e843d83d9d Make z in FGMRES a single Field 2017-11-08 16:38:16 +01:00
0f75ea52b7 First version of FGMRES; not working yet 2017-11-08 16:17:18 +01:00
8107b785cc Rename misunderstood "rsd_sq" to "rsq" in GMRES code 2017-11-08 14:40:03 +01:00
37b777d801 Add test for GMRES solver with staggered fermions 2017-11-08 14:28:48 +01:00
7382787856 Some minor changes 2017-11-08 14:23:55 +01:00
781c611ca0 Perform minor code style fix 2017-11-08 14:22:38 +01:00
b069090b52 Remove a superfluous comment 2017-11-08 13:58:02 +01:00
0c1c1d9900 Set precision and formatting only once in MR code 2017-11-08 13:57:06 +01:00
7f4ed6c2e5 First working version of GMRES + a test for Wilson fermions 2017-11-08 13:56:41 +01:00
56d32a4afb Rename misunderstood "rsd_sq" to "rsq" in MR code 2017-11-08 13:51:08 +01:00
b8ee496ed6 Print some info at start of GMRES 2017-11-08 13:23:41 +01:00
b87416dac4 Fix error with conformable 2017-11-07 15:00:08 +01:00
176bf37372 Remove some commented stuff 2017-11-07 14:57:36 +01:00
b3d342ca22 Remove old implementation of GMRES operator 2017-11-07 10:24:49 +01:00
e1f928398d Save current state 2017-11-07 10:22:41 +01:00
8c579d2d4a Save current state 2017-11-06 18:09:48 +01:00
fc7d07ade0 Correct function signature of body of GMRES outer loop 2017-11-06 17:12:38 +01:00
b3be9195b4 Save one lattice fermion in GMRES code 2017-11-06 17:12:23 +01:00
9e3c187a4d Save current state 2017-11-06 17:05:25 +01:00
8363edfcdb Perform some minor changes to GMRES code 2017-11-06 16:17:44 +01:00
74af31564f Adapt style of wilson GMRES test to style of wilson MR test 2017-11-06 14:06:45 +01:00
e0819d395f Merge remote-tracking branch 'upstream/develop' into feature/new-solver-algorithms 2017-11-06 13:09:36 +01:00
6f81906b00 Add test for the MR solver with staggered fermions; does not converge atm
TODO: Is this a property of staggered or did I do something wrong?
2017-10-30 16:57:55 +01:00
a2d83d4f3d Add test for the MR solver with DW fermions; does not converge atm
TODO: Is this a property of DWF or did I do something wrong?
2017-10-30 16:39:30 +01:00
89bacb0470 Fix path in MR solver header commentary 2017-10-30 16:16:55 +01:00
19010ff66a Merge remote-tracking branch 'upstream/develop' into feature/new-solver-algorithms 2017-10-30 13:16:46 +01:00
5a477ed29e Perform minor style correction 2017-10-27 14:46:18 +02:00
54128d579a Make MR a bit more verbose 2017-10-27 14:45:29 +02:00
e7b1933e88 Add a test for the MR solver 2017-10-27 14:38:57 +02:00
1bad64ac6a Some formatting 2017-10-27 14:35:04 +02:00
15dfa9f663 Change stopping criterion implementation in MR solver + some cleanup 2017-10-27 14:33:25 +02:00
2185b0d651 Correct author in the file 2017-10-27 14:32:38 +02:00
f61c0b5d03 Very early version of MR solver 2017-10-27 14:09:02 +02:00
074db32e54 Fix build of gmres test 2017-10-27 14:08:48 +02:00
d5f661ba70 Save intermediate state 2017-10-25 10:38:26 +02:00
1ab8d5cc13 Save two more files 2017-10-24 16:58:05 +02:00
789e892865 Save current state 2017-10-24 16:58:04 +02:00
53cfa44d7a Save current state 2017-10-24 16:58:03 +02:00
1257 changed files with 152315 additions and 98102 deletions

54
.github/ISSUE_TEMPLATE/bug-report.yml vendored Normal file
View File

@ -0,0 +1,54 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]
body:
- type: markdown
attributes:
value: >
Thank you for taking the time to file a bug report.
Please check that the code is pointing to the HEAD of develop
or any commit in master which is tagged with a version number.
- type: textarea
attributes:
label: "Describe the issue:"
description: >
Describe the issue and any previous attempt to solve it.
validations:
required: true
- type: textarea
attributes:
label: "Code example:"
description: >
If relevant, show how to reproduce the issue using a minimal working
example.
placeholder: |
<< your code here >>
render: shell
validations:
required: false
- type: textarea
attributes:
label: "Target platform:"
description: >
Give a description of the target platform (CPU, network, compiler).
Please give the full CPU part description, using for example
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
or `sysctl machdep.cpu.brand_string` (macOS) and the full output
the `--version` option of your compiler.
validations:
required: true
- type: textarea
attributes:
label: "Configure options:"
description: >
Please give the exact configure command used and attach
`config.log`, `grid.config.summary` and the output of `make V=1`.
render: shell
validations:
required: true

6
.gitignore vendored
View File

@ -1,3 +1,7 @@
# Doxygen stuff
html/*
latex/*
# Compiled Object files #
#########################
*.slo
@ -88,6 +92,7 @@ Thumbs.db
# build directory #
###################
build*/*
Documentation/_build
# IDE related files #
#####################
@ -114,3 +119,4 @@ gh-pages/
#####################
Grid/qcd/spin/gamma-gen/*.h
Grid/qcd/spin/gamma-gen/*.cc
Grid/util/Version.h

View File

@ -1,61 +0,0 @@
language: cpp
cache:
directories:
- clang
matrix:
include:
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
before_install:
- export GRIDDIR=`pwd`
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which autoconf
- autoconf --version
- which automake
- automake --version
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
script:
- ./bootstrap.sh
- mkdir build
- cd build
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@ -0,0 +1,5 @@
CXX=hipcc
MPICXX=mpicxx
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench

View File

@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@ -0,0 +1,5 @@
Version : 0.8.0
- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
- MPI and MPI3 comms optimisations for KNL and OPA finished
- Half precision comms

View File

@ -30,8 +30,44 @@ directory
#ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H
//disables and intel compiler specific warning (in json.hpp)
#pragma warning disable 488
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
//disables and intel compiler specific warning (in json.hpp)
#ifdef __ICC
#pragma warning disable 488
#endif
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
#endif
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC
#ifdef __ALTIVEC__
#define EIGEN_DONT_VECTORIZE
#endif
#ifdef __VSX__
#define EIGEN_DONT_VECTORIZE
#endif
#endif

View File

@ -42,6 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/Action.h>
#include <Grid/qcd/utils/GaugeFix.h>
#include <Grid/qcd/utils/CovariantSmearing.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/parallelIO/MetaData.h>
#include <Grid/qcd/hmc/HMC_aggregate.h>

View File

@ -38,16 +38,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_BASE_H
#define GRID_BASE_H
#include <Grid/GridStd.h>
#include <Grid/DisableWarnings.h>
#include <Grid/Namespace.h>
#include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/threads/Threads.h>
//#include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h>
#include <Grid/perfmon/Tracing.h>
#include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/threads/ThreadReduction.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h>
@ -55,7 +59,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice.h>
#include <Grid/cshift/Cshift.h>
#include <Grid/stencil/Stencil.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore)
#endif

View File

@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);
#endif

View File

@ -6,7 +6,9 @@
///////////////////
#include <cassert>
#include <complex>
#include <memory>
#include <vector>
#include <array>
#include <string>
#include <iostream>
#include <iomanip>
@ -14,6 +16,7 @@
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
@ -26,4 +29,7 @@
///////////////////
#include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */

View File

@ -1,14 +1,75 @@
#include <Grid/GridCore.h>
#pragma once
// Force Eigen to use MKL if Grid has been configured with --enable-mkl
#ifdef USE_MKL
#define EIGEN_USE_MKL_ALL
#endif
#if defined __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
#endif
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __CUDA_ARCH__
#undef __NVCC__
#undef __CUDACC__
#define __NVCC__REDEFINE__
#endif
/* SYCL save and restore compile environment*/
#ifdef GRID_SYCL
#pragma push
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif
/* HIP save and restore compile environment*/
#ifdef GRID_HIP
#pragma push
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#endif
#define EIGEN_NO_HIP
#include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
/* NVCC restore */
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop
#endif
/*SYCL restore*/
#ifdef __SYCL__REDEFINE__
#pragma pop_macro("__SYCL_DEVICE_ONLY__")
#pragma pop
#endif
/*HIP restore*/
#ifdef __HIP__REDEFINE__
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
#pragma pop
#endif
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif

1
Grid/Grid_Eigen_Tensor.h Normal file
View File

@ -0,0 +1 @@
#include <Grid/Grid_Eigen_Dense.h>

View File

@ -21,7 +21,8 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h
endif
all: version-cache
all: version-cache Version.h
version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
@ -42,7 +43,7 @@ version-cache:
fi;\
rm -f vertmp
Version.h:
Version.h: version-cache
cp version-cache Version.h
.PHONY: version-cache
@ -53,6 +54,23 @@ Version.h:
include Make.inc
include Eigen.inc
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS
extra_sources+=$(ZWILS_FERMION_FILES)
endif
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources)

43
Grid/Namespace.h Normal file
View File

@ -0,0 +1,43 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Namespace.h
Copyright (C) 2016
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <type_traits>
#include <cassert>
#include <exception>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }

View File

@ -29,33 +29,56 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H
NAMESPACE_CHECK(blas);
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h>
NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/JacobiPolynomial.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h>
NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/MinimalResidual.h>
#include <Grid/algorithms/iterative/GeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/multigrid/MultiGrid.h>
NAMESPACE_CHECK(multigrid);
#include <Grid/algorithms/FFT.h>
// EigCg
// Pcg
// Hdcg
// GCR
// etc..
#endif

View File

@ -1,480 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/CoarsenedMatrix.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H
namespace Grid {
class Geometry {
// int dimension;
public:
int npoint;
std::vector<int> directions ;
std::vector<int> displacements;
Geometry(int _d) {
int base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
for(int d=0;d<_d;d++){
directions[2*d ] = d+base;
directions[2*d+1] = d+base;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*_d]=0;
displacements[2*_d]=0;
//// report back
std::cout<<GridLogMessage<<"directions :";
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
std::cout <<std::endl;
std::cout<<GridLogMessage<<"displacements :";
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
std::cout<<std::endl;
}
/*
// Original cleaner code
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
for(int d=0;d<dimension;d++){
directions[2*d ] = d;
directions[2*d+1] = d;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*dimension]=0;
displacements[2*dimension]=0;
}
std::vector<int> GetDelta(int point) {
std::vector<int> delta(dimension,0);
delta[directions[point]] = displacements[point];
return delta;
};
*/
};
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal();
}
void CheckOrthogonal(void){
CoarseVector iProj(CoarseGrid);
CoarseVector eProj(CoarseGrid);
for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace);
eProj=zero;
parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){
eProj._odata[ss](i)=CComplex(1.0);
}
eProj=eProj - iProj;
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
}
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.checkerboard = subspace[0].checkerboard;
blockPromote(CoarseVec,FineVec,subspace);
}
void CreateSubspaceRandom(GridParallelRNG &RNG){
for(int i=0;i<nbasis;i++){
random(RNG,subspace[i]);
std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
}
Orthogonalise();
}
/*
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
// Run a Lanczos with sloppy convergence
const int Nstop = nn;
const int Nk = nn+20;
const int Np = nn+20;
const int Nm = Nk+Np;
const int MaxIt= 10000;
RealD resid = 1.0e-3;
Chebyshev<FineField> Cheb(0.5,64.0,21);
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
// IRL.lock = 1;
FineField noise(FineGrid); gaussian(RNG,noise);
FineField tmp(FineGrid);
std::vector<RealD> eval(Nm);
std::vector<FineField> evec(Nm,FineGrid);
int Nconv;
IRL.calc(eval,evec,
noise,
Nconv);
// pull back nn vectors
for(int b=0;b<nn;b++){
subspace[b] = evec[b];
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
noise = tmp - sqrt(eval[b])*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
noise = tmp + eval[b]*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
}
Orthogonalise();
for(int b=0;b<nn;b++){
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
}
}
*/
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,10000);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
Orthogonalise();
}
};
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
////////////////////
// Data members
////////////////////
Geometry geom;
GridBase * _grid;
CartesianStencil<siteVector,siteVector> Stencil;
std::vector<CoarseMatrix> A;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
RealD M (const CoarseVector &in, CoarseVector &out){
conformable(_grid,in._grid);
conformable(in._grid,out._grid);
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
siteVector res = zero;
siteVector nbr;
int ptype;
StencilEntry *SE;
for(int point=0;point<geom.npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in._odata[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in._odata[SE->_offset];
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
res = res + A[point]._odata[ss]*nbr;
}
vstream(out._odata[ss],res);
}
return norm2(out);
};
RealD Mdag (const CoarseVector &in, CoarseVector &out){
return M(in,out);
};
// Defer support for further coarsening for now
void Mdiag (const CoarseVector &in, CoarseVector &out){};
void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp){};
CoarsenedMatrix(GridCartesian &CoarseGrid) :
_grid(&CoarseGrid),
geom(CoarseGrid._ndimension),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
A(geom.npoint,&CoarseGrid)
{
};
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace){
FineField iblock(FineGrid); // contributions from within this block
FineField oblock(FineGrid); // contributions from outwith this block
FineField phi(FineGrid);
FineField tmp(FineGrid);
FineField zz(FineGrid); zz=zero;
FineField Mphi(FineGrid);
Lattice<iScalar<vInteger> > coor(FineGrid);
CoarseVector iProj(Grid());
CoarseVector oProj(Grid());
CoarseScalar InnerProd(Grid());
// Orthogonalise the subblocks over the basis
blockOrthogonalise(InnerProd,Subspace.subspace);
// Compute the matrix elements of linop between this orthonormal
// set of vectors.
int self_stencil=-1;
for(int p=0;p<geom.npoint;p++){
A[p]=zero;
if( geom.displacements[p]==0){
self_stencil=p;
}
}
assert(self_stencil!=-1);
for(int i=0;i<nbasis;i++){
phi=Subspace.subspace[i];
std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
for(int p=0;p<geom.npoint;p++){
int dir = geom.directions[p];
int disp = geom.displacements[p];
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
LatticeCoordinate(coor,dir);
if ( disp==0 ){
linop.OpDiag(phi,Mphi);
}
else {
linop.OpDir(phi,Mphi,dir,disp);
}
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
////////////////////////////////////////////////////////////////////////
if ( disp==0 ) {
iblock = Mphi;
oblock = zero;
} else if ( disp==1 ) {
oblock = where(mod(coor,block)==(block-1),Mphi,zz);
iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
} else if ( disp==-1 ) {
oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
} else {
assert(0);
}
Subspace.ProjectToSubspace(iProj,iblock);
Subspace.ProjectToSubspace(oProj,oblock);
// blockProject(iProj,iblock,Subspace.subspace);
// blockProject(oProj,oblock,Subspace.subspace);
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
for(int j=0;j<nbasis;j++){
if( disp!= 0 ) {
A[p]._odata[ss](j,i) = oProj._odata[ss](j);
}
A[self_stencil]._odata[ss](j,i) = A[self_stencil]._odata[ss](j,i) + iProj._odata[ss](j);
}
}
}
}
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< A[p] << std::endl;
}
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
// ForceHermitian();
AssertHermitian();
// ForceDiagonal();
}
void ForceDiagonal(void) {
std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
std::cout<<GridLogMessage<<"**** Forcing coarse operator to be diagonal ****"<<std::endl;
std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
for(int p=0;p<8;p++){
A[p]=zero;
}
GridParallelRNG RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
Complex one(1.0);
iMatrix<CComplex,nbasis> ident; ident=one;
val = val*adj(val);
val = val + 1.0;
A[8] = val*ident;
// for(int s=0;s<Grid()->oSites();s++) {
// A[8]._odata[s]=val._odata[s];
// }
}
void ForceHermitian(void) {
for(int d=0;d<4;d++){
int dd=d+1;
A[2*d] = adj(Cshift(A[2*d+1],dd,1));
}
// A[8] = 0.5*(A[8] + adj(A[8]));
}
void AssertHermitian(void) {
CoarseMatrix AA (Grid());
CoarseMatrix AAc (Grid());
CoarseMatrix Diff (Grid());
for(int d=0;d<4;d++){
int dd=d+1;
AAc = Cshift(A[2*d+1],dd,1);
AA = A[2*d];
Diff = AA - adj(AAc);
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
}
Diff = A[8] - adj(A[8]);
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
}
};
}
#endif

View File

@ -1,5 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -24,78 +23,77 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_FFT_H_
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#ifdef USE_MKL
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>
#else
#include <fftw3.h>
#endif
#endif
NAMESPACE_BEGIN(Grid);
namespace Grid {
template<class scalar> struct FFTW { };
template<class scalar> struct FFTW { };
#ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> {
public:
template<> struct FFTW<ComplexD> {
public:
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p);
}
};
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p);
}
};
template<> struct FFTW<ComplexF> {
public:
template<> struct FFTW<ComplexF> {
public:
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p);
}
};
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p);
}
};
#endif
@ -104,203 +102,202 @@ namespace Grid {
#define FFTW_BACKWARD (+1)
#endif
class FFT {
private:
class FFT {
private:
GridCartesian *vgrid;
GridCartesian *sgrid;
GridCartesian *vgrid;
GridCartesian *sgrid;
int Nd;
double flops;
double flops_call;
uint64_t usec;
int Nd;
double flops;
double flops_call;
uint64_t usec;
std::vector<int> dimensions;
std::vector<int> processors;
std::vector<int> processor_coor;
Coordinate dimensions;
Coordinate processors;
Coordinate processor_coor;
public:
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) :
FFT ( GridCartesian * grid ) :
vgrid(grid),
Nd(grid->_ndimension),
dimensions(grid->_fdimensions),
processors(grid->_processors),
processor_coor(grid->_processor_coor)
{
flops=0;
usec =0;
std::vector<int> layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors);
};
~FFT ( void) {
delete sgrid;
}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
Lattice<vobj> tmp(vgrid);
tmp = source;
for(int d=0;d<Nd;d++){
if( mask[d] ) {
FFT_dim(result,tmp,d,sign);
tmp=result;
}
}
}
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
std::vector<int> mask(Nd,1);
FFT_dim_mask(result,source,mask,sign);
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
assert(0);
#else
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
std::vector<int> layout(Nd,1);
std::vector<int> pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
PARALLEL_REGION
{
std::vector<int> cbuf(Nd);
sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
}
}
if (p != processors[dim] - 1)
{
result = Cshift(result,dim,L);
}
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
PARALLEL_REGION
{
std::vector<int> cbuf(Nd);
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<NN;idx++) {
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
}
}
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
// writing out result
PARALLEL_REGION
{
std::vector<int> clbuf(Nd), cgbuf(Nd);
sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
}
}
result = result*div;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
{
flops=0;
usec =0;
Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors,*grid);
};
}
~FFT ( void) {
delete sgrid;
}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
Lattice<vobj> tmp(vgrid);
tmp = source;
for(int d=0;d<Nd;d++){
if( mask[d] ) {
FFT_dim(result,tmp,d,sign);
tmp=result;
}
}
}
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
Coordinate mask(Nd,1);
FFT_dim_mask(result,source,mask,sign);
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
assert(0);
#else
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
Coordinate layout(Nd,1);
Coordinate pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
{
autoView(r_v,result,CpuRead);
autoView(p_v,pgbuf,CpuWrite);
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,r_v,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
pokeLocalSite(s,p_v,cbuf);
});
}
if (p != processors[dim] - 1) {
result = Cshift(result,dim,L);
}
}
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
thread_for( idx,NN,{
Coordinate cbuf(Nd);
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
});
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
autoView(result_v,result,CpuWrite);
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf_v,cgbuf);
pokeLocalSite(s,result_v,clbuf);
});
}
result = result*div;
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
};
NAMESPACE_END(Grid);
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,24 +23,30 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PRECONDITIONER_H
#define GRID_PRECONDITIONER_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class Field> class Preconditioner : public LinearFunction<Field> {
virtual void operator()(const Field &src, Field & psi)=0;
};
template<class Field> using Preconditioner = LinearFunction<Field> ;
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
void operator()(const Field &src, Field & psi){
psi = src;
}
TrivialPrecon(void){};
};
/*
template<class Field> class Preconditioner : public LinearFunction<Field> {
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field & psi)=0;
};
*/
}
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
using Preconditioner<Field>::operator();
virtual void operator()(const Field &src, Field & psi){
psi = src;
}
TrivialPrecon(void){};
};
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,57 +23,64 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_SPARSE_MATRIX_H
#define GRID_ALGORITHM_SPARSE_MATRIX_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SparseMatrixBase {
public:
virtual GridBase *Grid(void) =0;
// Full checkerboar operations
virtual RealD M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp (in._grid);
ni=M(in,tmp);
no=Mdag(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SparseMatrixBase {
public:
virtual GridBase *Grid(void) =0;
// Full checkerboar operations
virtual void M (const Field &in, Field &out)=0;
virtual void Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out) {
Field tmp (in.Grid());
M(in,tmp);
Mdag(tmp,out);
}
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
virtual ~SparseMatrixBase() {};
};
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface augmented by a red black sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public:
virtual GridBase *RedBlackGrid(void)=0;
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface augmented by a red black sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public:
virtual GridBase *RedBlackGrid(void)=0;
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 0; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
// half checkerboard operaions
virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0;
virtual void MooeeInv (const Field &in, Field &out)=0;
// half checkerboard operaions
virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0;
virtual void MooeeInv (const Field &in, Field &out)=0;
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual ~CheckerBoardedSparseMatrixBase() {};
};
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -25,14 +25,14 @@ Author: Christoph Lehner <clehner@bnl.gov>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CHEBYSHEV_H
#define GRID_CHEBYSHEV_H
#include <Grid/algorithms/LinearOperator.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
struct ChebyParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
@ -41,337 +41,374 @@ struct ChebyParams : Serializable {
int, Npoly);
};
////////////////////////////////////////////////////////////////////////////////////////////
// Generic Chebyshev approximations
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class Chebyshev : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD hi;
RealD lo;
////////////////////////////////////////////////////////////////////////////////////////////
// Generic Chebyshev approximations
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class Chebyshev : public OperatorFunction<Field> {
private:
using OperatorFunction<Field>::operator();
public:
void csv(std::ostream &out){
RealD diff = hi-lo;
RealD delta = (hi-lo)*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
std::vector<RealD> Coeffs;
int order;
RealD hi;
RealD lo;
public:
void csv(std::ostream &out){
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.02;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
// Convenience for plotting the approximation
void PlotApprox(std::ostream &out) {
out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
out <<x<<"\t"<<approx(x)<<std::endl;
}
};
Chebyshev(){};
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
////////////////////////////////////////////////////////////////////////////////////////////////////
// CJ: the one we need for Lanczos
void Init(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
};
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
}
sumUsq = std::sqrt(sumUsq);
for(int i=1;i<=M;i++){
a[i] = U[i]/sumUsq;
}
g[0] = 1.0;
for(int m=1;m<=M;m++){
g[m] = 0;
for(int i=0;i<=M-m;i++){
g[m]+= a[i]*a[m+i];
}
}
for(int m=1;m<=M;m++){
Coeffs[m]*=g[m];
}
}
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
RealD approxD(RealD x)
{
RealD Un;
RealD Unm;
RealD Unp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD U0=1;
RealD U1=2*y;
RealD sum;
sum = Coeffs[1]*U0;
sum+= Coeffs[2]*U1*2.0;
Un =U1;
Unm=U0;
for(int i=2;i<order-1;i++){
Unp=2*y*Un-Unm;
Unm=Un;
Un =Unp;
sum+= Un*Coeffs[i+1]*(i+1.0);
}
return sum/(0.5*(hi-lo));
};
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
RealD x = x0;
RealD eps;
int i;
for (i=0;i<maxiter;i++) {
eps = approx(x) - z;
if (fabs(eps / z) < resid)
return x;
x = x - eps / approxD(x);
}
return std::numeric_limits<double>::quiet_NaN();
}
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
// std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
T1=y*xscale+in*mscale;
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
// Convenience for plotting the approximation
void PlotApprox(std::ostream &out) {
out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
out <<x<<"\t"<<approx(x)<<std::endl;
}
};
Chebyshev(){};
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;
RealD beta;
RealD mu;
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
////////////////////////////////////////////////////////////////////////////////////////////////////
// CJ: the one we need for Lanczos
void Init(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order,0.0);
Coeffs[order-1] = 1.0;
};
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
// Similar kick effect below the threshold as Lanczos filter approach
void InitLowPass(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD k=(order-1.0);
RealD s=std::cos( j*M_PI*(k+0.5)/order );
Coeffs[j] = s * 2.0/order;
}
};
public:
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
}
sumUsq = std::sqrt(sumUsq);
for(int i=1;i<=M;i++){
a[i] = U[i]/sumUsq;
}
g[0] = 1.0;
for(int m=1;m<=M;m++){
g[m] = 0;
for(int i=0;i<=M-m;i++){
g[m]+= a[i]*a[m+i];
}
}
for(int m=1;m<=M;m++){
Coeffs[m]*=g[m];
}
}
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
RealD approxD(RealD x)
{
RealD Un;
RealD Unm;
RealD Unp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD U0=1;
RealD U1=2*y;
RealD sum;
sum = Coeffs[1]*U0;
sum+= Coeffs[2]*U1*2.0;
Un =U1;
Unm=U0;
for(int i=2;i<order-1;i++){
Unp=2*y*Un-Unm;
Unm=Un;
Un =Unp;
sum+= Un*Coeffs[i+1]*(i+1.0);
}
return sum/(0.5*(hi-lo));
};
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
RealD x = x0;
RealD eps;
int i;
for (i=0;i<maxiter;i++) {
eps = approx(x) - z;
if (fabs(eps / z) < resid)
return x;
x = x - eps / approxD(x);
}
return std::numeric_limits<double>::quiet_NaN();
}
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites();
typedef typename Field::vector_type vector_type;
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
axpby(T1,xscale,mscale,y,in);
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;
axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;
RealD beta;
RealD mu;
public:
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
alpha(_alpha),
beta(_beta),
mu(_mu)
{
order=_order;
Coeffs.resize(order);
for(int i=0;i<_order;i++){
Coeffs[i] = 0.0;
}
Coeffs[order-1]=1.0;
};
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
RealD approx(RealD xx) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
Real aa = alpha * alpha;
Real bb = beta * beta;
RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
RealD y= x;
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
// shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
GridBase *grid=in._grid;
Field tmp(grid);
RealD aa= alpha*alpha;
RealD bb= beta * beta;
Linop.HermOp(in,out);
out = out - mu*in;
Linop.HermOp(out,tmp);
tmp = tmp - mu * out;
out = (2.0/ (aa-bb) ) * tmp - ((aa+bb)/(aa-bb))*in;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
beta(_beta),
mu(_mu)
{
order=_order;
Coeffs.resize(order);
for(int i=0;i<_order;i++){
Coeffs[i] = 0.0;
}
Coeffs[order-1]=1.0;
};
}
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
RealD approx(RealD xx) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
Real aa = alpha * alpha;
Real bb = beta * beta;
RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
RealD y= x;
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
// shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
GridBase *grid=in.Grid();
Field tmp(grid);
RealD aa= alpha*alpha;
RealD bb= beta * beta;
Linop.HermOp(in,out);
out = out - mu*in;
Linop.HermOp(out,tmp);
tmp = tmp - mu * out;
out = (2.0/ (aa-bb) ) * tmp - ((aa+bb)/(aa-bb))*in;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -26,127 +26,127 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/* END LEGAL */
#ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
class Forecast
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
class Forecast
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = Zero(); return chi; }
else if(degree == 1){ return prev_solns[0]; }
// RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = conjugate(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(abs(G[j][j]) > abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = Zero();
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = conjugate(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = zero; return chi; }
else if(degree == 1){ return prev_solns[0]; }
RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = std::conj(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = zero;
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = std::conj(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,129 @@
#ifndef GRID_JACOBIPOLYNOMIAL_H
#define GRID_JACOBIPOLYNOMIAL_H
#include <Grid/algorithms/LinearOperator.h>
NAMESPACE_BEGIN(Grid);
template<class Field>
class JacobiPolynomial : public OperatorFunction<Field> {
private:
using OperatorFunction<Field>::operator();
int order;
RealD hi;
RealD lo;
RealD alpha;
RealD beta;
public:
void csv(std::ostream &out){
csv(out,lo,hi);
}
void csv(std::ostream &out,RealD llo,RealD hhi){
RealD diff = hhi-llo;
RealD delta = diff*1.0e-5;
for (RealD x=llo-delta; x<=hhi; x+=delta) {
RealD f = approx(x);
out<< x<<" "<<f <<std::endl;
}
return;
}
JacobiPolynomial(){};
JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta)
{
lo=_lo;
hi=_hi;
alpha=_alpha;
beta=_beta;
order=_order;
};
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1.0;
RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
Tn =T1;
Tnm=T0;
for(int n=2;n<=order;n++){
RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
Tnm=Tn;
Tn =Tnp;
}
return Tnp;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites();
Field T0(grid);
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// RealD T0=1.0;
T0=in;
// RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
// = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
Linop.HermOp(T0,y);
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
y=y*xscale+in*mscale;
// RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
RealD halfAmB = (alpha-beta)*0.5;
RealD halfApBp2= (alpha+beta+2.0)*0.5;
T1 = halfAmB * in + halfApBp2*y;
for(int n=2;n<=order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
// Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
cny=cny/cnp;
cn1=cn1/cnp;
cn1=cn1/cnp;
cnm=cnm/cnp;
*Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
out=*Tnp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -27,7 +27,8 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
double MultiShiftFunction::approx(double x)
{
double a = norm;
@ -53,4 +54,4 @@ void MultiShiftFunction::csv(std::ostream &out)
}
return;
}
}
NAMESPACE_END(Grid);

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef MULTI_SHIFT_FUNCTION
#define MULTI_SHIFT_FUNCTION
namespace Grid {
NAMESPACE_BEGIN(Grid);
class MultiShiftFunction {
public:
@ -40,7 +40,7 @@ public:
RealD norm;
RealD lo,hi;
MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;};
RealD approx(RealD x);
void csv(std::ostream &out);
void gnuplot(std::ostream &out);
@ -63,5 +63,5 @@ public:
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -298,7 +298,7 @@ void AlgRemez::stpini(bigfloat *step) {
// Search for error maxima and minima
void AlgRemez::search(bigfloat *step) {
bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
int i, j, meq, emsign, ensign, steps;
int i, meq, emsign, ensign, steps;
meq = neq + 1;
bigfloat *yy = new bigfloat[meq];
@ -306,7 +306,6 @@ void AlgRemez::search(bigfloat *step) {
bigfloat eclose = 1.0e30;
bigfloat farther = 0l;
j = 1;
xx0 = apstrt;
for (i = 0; i < meq; i++) {

View File

@ -0,0 +1,473 @@
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<string>
#include<iostream>
#include<iomanip>
#include<cassert>
#include<Grid/algorithms/approx/RemezGeneral.h>
// Constructor
AlgRemezGeneral::AlgRemezGeneral(double lower, double upper, long precision,
bigfloat (*f)(bigfloat x, void *data), void *data): f(f),
data(data),
prec(precision),
apstrt(lower), apend(upper), apwidt(upper - lower),
n(0), d(0), pow_n(0), pow_d(0)
{
bigfloat::setDefaultPrecision(prec);
std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n";
std::cout<<"Precision of arithmetic is "<<precision<<std::endl;
}
//Determine the properties of the numerator and denominator polynomials
void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in){
pow_n = num_degree;
pow_d = den_degree;
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0);
num_type = num_type_in;
den_type = den_type_in;
num_pows.resize(pow_n+1);
den_pows.resize(pow_d+1);
int n_in = 0;
bool odd = num_type == PolyType::Full || num_type == PolyType::Odd;
bool even = num_type == PolyType::Full || num_type == PolyType::Even;
for(int i=0;i<=pow_n;i++){
num_pows[i] = -1;
if(i % 2 == 0 && even) num_pows[i] = n_in++;
if(i % 2 == 1 && odd) num_pows[i] = n_in++;
}
std::cout << n_in << " terms in numerator" << std::endl;
--n_in; //power is 1 less than the number of terms, eg pow=1 a x^1 + b x^0
int d_in = 0;
odd = den_type == PolyType::Full || den_type == PolyType::Odd;
even = den_type == PolyType::Full || den_type == PolyType::Even;
for(int i=0;i<=pow_d;i++){
den_pows[i] = -1;
if(i % 2 == 0 && even) den_pows[i] = d_in++;
if(i % 2 == 1 && odd) den_pows[i] = d_in++;
}
std::cout << d_in << " terms in denominator" << std::endl;
--d_in;
n = n_in;
d = d_in;
}
//Setup algorithm
void AlgRemezGeneral::reinitializeAlgorithm(){
spread = 1.0e37;
iter = 0;
neq = n + d + 1; //not +2 because highest-power term in denominator is fixed to 1
param.resize(neq);
yy.resize(neq+1);
//Initialize linear equation temporaries
A.resize(neq*neq);
B.resize(neq);
IPS.resize(neq);
//Initialize maximum and minimum errors
xx.resize(neq+2);
mm.resize(neq+1);
initialGuess();
//Initialize search steps
step.resize(neq+1);
stpini();
}
double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degree,
const PolyType num_type_in, const PolyType den_type_in,
const double _tolerance, const int report_freq){
//Setup the properties of the polynomial
setupPolyProperties(num_degree, den_degree, num_type_in, den_type_in);
//Setup the algorithm
reinitializeAlgorithm();
bigfloat tolerance = _tolerance;
//Iterate until convergance
while (spread > tolerance) {
if (iter++ % report_freq==0)
std::cout<<"Iteration " <<iter-1<<" spread "<<(double)spread<<" delta "<<(double)delta << std::endl;
equations();
if (delta < tolerance) {
std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
assert(0);
};
assert( delta>= tolerance );
search();
}
int sign;
double error = (double)getErr(mm[0],&sign);
std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl;
// Return the maximum error in the approximation
return error;
}
// Initial values of maximal and minimal errors
void AlgRemezGeneral::initialGuess(){
// Supply initial guesses for solution points
long ncheb = neq; // Degree of Chebyshev error estimate
// Find ncheb+1 extrema of Chebyshev polynomial
bigfloat a = ncheb;
bigfloat r;
mm[0] = apstrt;
for (long i = 1; i < ncheb; i++) {
r = 0.5 * (1 - cos((M_PI * i)/(double) a));
//r *= sqrt_bf(r);
r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
mm[i] = apstrt + r * apwidt;
}
mm[ncheb] = apend;
a = 2.0 * ncheb;
for (long i = 0; i <= ncheb; i++) {
r = 0.5 * (1 - cos(M_PI * (2*i+1)/(double) a));
//r *= sqrt_bf(r); // Squeeze to low end of interval
r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
xx[i] = apstrt + r * apwidt;
}
}
// Initialise step sizes
void AlgRemezGeneral::stpini(){
xx[neq+1] = apend;
delta = 0.25;
step[0] = xx[0] - apstrt;
for (int i = 1; i < neq; i++) step[i] = xx[i] - xx[i-1];
step[neq] = step[neq-1];
}
// Search for error maxima and minima
void AlgRemezGeneral::search(){
bigfloat a, q, xm, ym, xn, yn, xx1;
int emsign, ensign, steps;
int meq = neq + 1;
bigfloat eclose = 1.0e30;
bigfloat farther = 0l;
bigfloat xx0 = apstrt;
for (int i = 0; i < meq; i++) {
steps = 0;
xx1 = xx[i]; // Next zero
if (i == meq-1) xx1 = apend;
xm = mm[i];
ym = getErr(xm,&emsign);
q = step[i];
xn = xm + q;
if (xn < xx0 || xn >= xx1) { // Cannot skip over adjacent boundaries
q = -q;
xn = xm;
yn = ym;
ensign = emsign;
} else {
yn = getErr(xn,&ensign);
if (yn < ym) {
q = -q;
xn = xm;
yn = ym;
ensign = emsign;
}
}
while(yn >= ym) { // March until error becomes smaller.
if (++steps > 10)
break;
ym = yn;
xm = xn;
emsign = ensign;
a = xm + q;
if (a == xm || a <= xx0 || a >= xx1)
break;// Must not skip over the zeros either side.
xn = a;
yn = getErr(xn,&ensign);
}
mm[i] = xm; // Position of maximum
yy[i] = ym; // Value of maximum
if (eclose > ym) eclose = ym;
if (farther < ym) farther = ym;
xx0 = xx1; // Walk to next zero.
} // end of search loop
q = (farther - eclose); // Decrease step size if error spread increased
if (eclose != 0.0) q /= eclose; // Relative error spread
if (q >= spread)
delta *= 0.5; // Spread is increasing; decrease step size
spread = q;
for (int i = 0; i < neq; i++) {
q = yy[i+1];
if (q != 0.0) q = yy[i] / q - (bigfloat)1l;
else q = 0.0625;
if (q > (bigfloat)0.25) q = 0.25;
q *= mm[i+1] - mm[i];
step[i] = q * delta;
}
step[neq] = step[neq-1];
for (int i = 0; i < neq; i++) { // Insert new locations for the zeros.
xm = xx[i] - step[i];
if (xm <= apstrt)
continue;
if (xm >= apend)
continue;
if (xm <= mm[i])
xm = (bigfloat)0.5 * (mm[i] + xx[i]);
if (xm >= mm[i+1])
xm = (bigfloat)0.5 * (mm[i+1] + xx[i]);
xx[i] = xm;
}
}
// Solve the equations
void AlgRemezGeneral::equations(){
bigfloat x, y, z;
bigfloat *aa;
for (int i = 0; i < neq; i++) { // set up the equations for solution by simq()
int ip = neq * i; // offset to 1st element of this row of matrix
x = xx[i]; // the guess for this row
y = func(x); // right-hand-side vector
z = (bigfloat)1l;
aa = A.data()+ip;
int t = 0;
for (int j = 0; j <= pow_n; j++) {
if(num_pows[j] != -1){ *aa++ = z; t++; }
z *= x;
}
assert(t == n+1);
z = (bigfloat)1l;
t = 0;
for (int j = 0; j < pow_d; j++) {
if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
z *= x;
}
assert(t == d);
B[i] = y * z; // Right hand side vector
}
// Solve the simultaneous linear equations.
if (simq()){
std::cout<<"simq failed\n";
exit(0);
}
}
// Evaluate the rational form P(x)/Q(x) using coefficients
// from the solution vector param
bigfloat AlgRemezGeneral::approx(const bigfloat x) const{
// Work backwards toward the constant term.
int c = n;
bigfloat yn = param[c--]; // Highest order numerator coefficient
for (int i = pow_n-1; i >= 0; i--) yn = x * yn + (num_pows[i] != -1 ? param[c--] : bigfloat(0l));
c = n+d;
bigfloat yd = 1l; //Highest degree coefficient is 1.0
for (int i = pow_d-1; i >= 0; i--) yd = x * yd + (den_pows[i] != -1 ? param[c--] : bigfloat(0l));
return(yn/yd);
}
// Compute size and sign of the approximation error at x
bigfloat AlgRemezGeneral::getErr(bigfloat x, int *sign) const{
bigfloat f = func(x);
bigfloat e = approx(x) - f;
if (f != 0) e /= f;
if (e < (bigfloat)0.0) {
*sign = -1;
e = -e;
}
else *sign = 1;
return(e);
}
// Solve the system AX=B
int AlgRemezGeneral::simq(){
int ip, ipj, ipk, ipn;
int idxpiv;
int kp, kp1, kpk, kpn;
int nip, nkp;
bigfloat em, q, rownrm, big, size, pivot, sum;
bigfloat *aa;
bigfloat *X = param.data();
int n = neq;
int nm1 = n - 1;
// Initialize IPS and X
int ij = 0;
for (int i = 0; i < n; i++) {
IPS[i] = i;
rownrm = 0.0;
for(int j = 0; j < n; j++) {
q = abs_bf(A[ij]);
if(rownrm < q) rownrm = q;
++ij;
}
if (rownrm == (bigfloat)0l) {
std::cout<<"simq rownrm=0\n";
return(1);
}
X[i] = (bigfloat)1.0 / rownrm;
}
for (int k = 0; k < nm1; k++) {
big = 0.0;
idxpiv = 0;
for (int i = k; i < n; i++) {
ip = IPS[i];
ipk = n*ip + k;
size = abs_bf(A[ipk]) * X[ip];
if (size > big) {
big = size;
idxpiv = i;
}
}
if (big == (bigfloat)0l) {
std::cout<<"simq big=0\n";
return(2);
}
if (idxpiv != k) {
int j = IPS[k];
IPS[k] = IPS[idxpiv];
IPS[idxpiv] = j;
}
kp = IPS[k];
kpk = n*kp + k;
pivot = A[kpk];
kp1 = k+1;
for (int i = kp1; i < n; i++) {
ip = IPS[i];
ipk = n*ip + k;
em = -A[ipk] / pivot;
A[ipk] = -em;
nip = n*ip;
nkp = n*kp;
aa = A.data()+nkp+kp1;
for (int j = kp1; j < n; j++) {
ipj = nip + j;
A[ipj] = A[ipj] + em * *aa++;
}
}
}
kpn = n * IPS[n-1] + n - 1; // last element of IPS[n] th row
if (A[kpn] == (bigfloat)0l) {
std::cout<<"simq A[kpn]=0\n";
return(3);
}
ip = IPS[0];
X[0] = B[ip];
for (int i = 1; i < n; i++) {
ip = IPS[i];
ipj = n * ip;
sum = 0.0;
for (int j = 0; j < i; j++) {
sum += A[ipj] * X[j];
++ipj;
}
X[i] = B[ip] - sum;
}
ipn = n * IPS[n-1] + n - 1;
X[n-1] = X[n-1] / A[ipn];
for (int iback = 1; iback < n; iback++) {
//i goes (n-1),...,1
int i = nm1 - iback;
ip = IPS[i];
nip = n*ip;
sum = 0.0;
aa = A.data()+nip+i+1;
for (int j= i + 1; j < n; j++)
sum += *aa++ * X[j];
X[i] = (X[i] - sum) / A[nip+i];
}
return(0);
}
void AlgRemezGeneral::csv(std::ostream & os) const{
os << "Numerator" << std::endl;
for(int i=0;i<=pow_n;i++){
os << getCoeffNum(i) << "*x^" << i;
if(i!=pow_n) os << " + ";
}
os << std::endl;
os << "Denominator" << std::endl;
for(int i=0;i<=pow_d;i++){
os << getCoeffDen(i) << "*x^" << i;
if(i!=pow_d) os << " + ";
}
os << std::endl;
//For a true minimax solution the errors should all be equal and the signs should oscillate +-+-+- etc
int sign;
os << "Errors at maxima: coordinate, error, (sign)" << std::endl;
for(int i=0;i<neq+1;i++){
os << mm[i] << " " << getErr(mm[i],&sign) << " (" << sign << ")" << std::endl;
}
os << "Scan over range:" << std::endl;
int npt = 60;
bigfloat dlt = (apend - apstrt)/bigfloat(npt-1);
for (bigfloat x=apstrt; x<=apend; x = x + dlt) {
double f = evaluateFunc(x);
double r = evaluateApprox(x);
os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
}
return;
}

View File

@ -0,0 +1,170 @@
/*
C.Kelly Jan 2020 based on implementation by M. Clark May 2005
AlgRemezGeneral is an implementation of the Remez algorithm for approximating an arbitrary function by a rational polynomial
It includes optional restriction to odd/even polynomials for the numerator and/or denominator
*/
#ifndef INCLUDED_ALG_REMEZ_GENERAL_H
#define INCLUDED_ALG_REMEZ_GENERAL_H
#include <stddef.h>
#include <Grid/GridStd.h>
#ifdef HAVE_LIBGMP
#include "bigfloat.h"
#else
#include "bigfloat_double.h"
#endif
class AlgRemezGeneral{
public:
enum PolyType { Even, Odd, Full };
private:
// In GSL-style, pass the function as a function pointer. Any data required to evaluate the function is passed in as a void pointer
bigfloat (*f)(bigfloat x, void *data);
void *data;
// The approximation parameters
std::vector<bigfloat> param;
bigfloat norm;
// The number of non-zero terms in the numerator and denominator
int n, d;
// The numerator and denominator degree (i.e. the largest power)
int pow_n, pow_d;
// Specify if the numerator and/or denominator are odd/even polynomials
PolyType num_type;
PolyType den_type;
std::vector<int> num_pows; //contains the mapping, with -1 if not present
std::vector<int> den_pows;
// The bounds of the approximation
bigfloat apstrt, apwidt, apend;
// Variables used to calculate the approximation
int nd1, iter;
std::vector<bigfloat> xx;
std::vector<bigfloat> mm;
std::vector<bigfloat> step;
bigfloat delta, spread;
// Variables used in search
std::vector<bigfloat> yy;
// Variables used in solving linear equations
std::vector<bigfloat> A;
std::vector<bigfloat> B;
std::vector<int> IPS;
// The number of equations we must solve at each iteration (n+d+1)
int neq;
// The precision of the GNU MP library
long prec;
// Initialize member variables associated with the polynomial's properties
void setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in);
// Initial values of maximal and minmal errors
void initialGuess();
// Initialise step sizes
void stpini();
// Initialize the algorithm
void reinitializeAlgorithm();
// Solve the equations
void equations();
// Search for error maxima and minima
void search();
// Calculate function required for the approximation
inline bigfloat func(bigfloat x) const{
return f(x, data);
}
// Compute size and sign of the approximation error at x
bigfloat getErr(bigfloat x, int *sign) const;
// Solve the system AX=B where X = param
int simq();
// Evaluate the rational form P(x)/Q(x) using coefficients from the solution vector param
bigfloat approx(bigfloat x) const;
public:
AlgRemezGeneral(double lower, double upper, long prec,
bigfloat (*f)(bigfloat x, void *data), void *data);
inline int getDegree(void) const{
assert(n==d);
return n;
}
// Reset the bounds of the approximation
inline void setBounds(double lower, double upper) {
apstrt = lower;
apend = upper;
apwidt = apend - apstrt;
}
// Get the bounds of the approximation
inline void getBounds(double &lower, double &upper) const{
lower=(double)apstrt;
upper=(double)apend;
}
// Run the algorithm to generate the rational approximation
double generateApprox(int num_degree, int den_degree,
PolyType num_type, PolyType den_type,
const double tolerance = 1e-15, const int report_freq = 1000);
inline double generateApprox(int num_degree, int den_degree,
const double tolerance = 1e-15, const int report_freq = 1000){
return generateApprox(num_degree, den_degree, Full, Full, tolerance, report_freq);
}
// Evaluate the rational form P(x)/Q(x) using coefficients from the
// solution vector param
inline double evaluateApprox(double x) const{
return (double)approx((bigfloat)x);
}
// Evaluate the rational form Q(x)/P(x) using coefficients from the solution vector param
inline double evaluateInverseApprox(double x) const{
return 1.0/(double)approx((bigfloat)x);
}
// Calculate function required for the approximation
inline double evaluateFunc(double x) const{
return (double)func((bigfloat)x);
}
// Calculate inverse function required for the approximation
inline double evaluateInverseFunc(double x) const{
return 1.0/(double)func((bigfloat)x);
}
// Dump csv of function, approx and error
void csv(std::ostream &os = std::cout) const;
// Get the coefficient of the term x^i in the numerator
inline double getCoeffNum(const int i) const{
return num_pows[i] == -1 ? 0. : double(param[num_pows[i]]);
}
// Get the coefficient of the term x^i in the denominator
inline double getCoeffDen(const int i) const{
if(i == pow_d) return 1.0;
else return den_pows[i] == -1 ? 0. : double(param[den_pows[i]+n+1]);
}
};
#endif

View File

@ -0,0 +1,183 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/ZMobius.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/algorithms/approx/ZMobius.h>
#include <Grid/algorithms/approx/RemezGeneral.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
//Compute the tanh approximation
inline double epsilonMobius(const double x, const std::vector<ComplexD> &w){
int Ls = w.size();
ComplexD fxp = 1., fmp = 1.;
for(int i=0;i<Ls;i++){
fxp = fxp * ( w[i] + x );
fmp = fmp * ( w[i] - x );
}
return ((fxp - fmp)/(fxp + fmp)).real();
}
inline double epsilonMobius(const double x, const std::vector<RealD> &w){
int Ls = w.size();
double fxp = 1., fmp = 1.;
for(int i=0;i<Ls;i++){
fxp = fxp * ( w[i] + x );
fmp = fmp * ( w[i] - x );
}
return (fxp - fmp)/(fxp + fmp);
}
//Compute the tanh approximation in a form suitable for the Remez
bigfloat epsilonMobius(bigfloat x, void* data){
const std::vector<RealD> &omega = *( (std::vector<RealD> const*)data );
bigfloat fxp(1.0);
bigfloat fmp(1.0);
for(int i=0;i<omega.size();i++){
fxp = fxp * ( bigfloat(omega[i]) + x);
fmp = fmp * ( bigfloat(omega[i]) - x);
}
return (fxp - fmp)/(fxp + fmp);
}
//Compute the Zmobius Omega parameters suitable for eigenvalue range -lambda_bound <= lambda <= lambda_bound
//Note omega_i = 1/(b_i + c_i) where b_i and c_i are the Mobius parameters
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound){
assert(omega_in.size() == Ls_in);
omega_out.resize(Ls_out);
//Use the Remez algorithm to generate the appropriate rational polynomial
//For odd polynomial, to satisfy Haar condition must take either positive or negative half of range (cf https://arxiv.org/pdf/0803.0439.pdf page 6)
AlgRemezGeneral remez(0, lambda_bound, 64, &epsilonMobius, (void*)&omega_in);
remez.generateApprox(Ls_out-1, Ls_out,AlgRemezGeneral::Odd, AlgRemezGeneral::Even, 1e-15, 100);
remez.csv(std::cout);
//The rational approximation has the form [ f(x) - f(-x) ] / [ f(x) + f(-x) ] where f(x) = \Prod_{i=0}^{L_s-1} ( \omega_i + x )
//cf https://academiccommons.columbia.edu/doi/10.7916/D8T72HD7 pg 102
//omega_i are therefore the negative of the complex roots of f(x)
//We can find the roots by recognizing that the eigenvalues of a matrix A are the roots of the characteristic polynomial
// \rho(\lambda) = det( A - \lambda I ) where I is the unit matrix
//The matrix whose characteristic polynomial is an arbitrary monic polynomial a0 + a1 x + a2 x^2 + ... x^n is the companion matrix
// A = | 0 1 0 0 0 .... 0 |
// | 0 0 1 0 0 .... 0 |
// | : : : : : : |
// | 0 0 0 0 0 1
// | -a0 -a1 -a2 ... ... -an|
//Note the Remez defines the largest power to have unit coefficient
std::vector<RealD> coeffs(Ls_out+1);
for(int i=0;i<Ls_out+1;i+=2) coeffs[i] = coeffs[i] = remez.getCoeffDen(i); //even powers
for(int i=1;i<Ls_out+1;i+=2) coeffs[i] = coeffs[i] = remez.getCoeffNum(i); //odd powers
std::vector<std::complex<RealD> > roots(Ls_out);
//Form the companion matrix
Eigen::MatrixXd compn(Ls_out,Ls_out);
for(int i=0;i<Ls_out-1;i++) compn(i,0) = 0.;
compn(Ls_out - 1, 0) = -coeffs[0];
for(int j=1;j<Ls_out;j++){
for(int i=0;i<Ls_out-1;i++) compn(i,j) = i == j-1 ? 1. : 0.;
compn(Ls_out - 1, j) = -coeffs[j];
}
//Eigensolve
Eigen::EigenSolver<Eigen::MatrixXd> slv(compn, false);
const auto & ev = slv.eigenvalues();
for(int i=0;i<Ls_out;i++)
omega_out[i] = -ev(i);
//Sort ascending (smallest at start of vector!)
std::sort(omega_out.begin(), omega_out.end(),
[&](const ComplexD &a, const ComplexD &b){ return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag()); });
//McGlynn thesis pg 122 suggest improved iteration counts if magnitude of omega diminishes towards the center of the 5th dimension
std::vector<ComplexD> omega_tmp = omega_out;
int s_low=0, s_high=Ls_out-1, ss=0;
for(int s_from = Ls_out-1; s_from >= 0; s_from--){ //loop from largest omega
int s_to;
if(ss % 2 == 0){
s_to = s_low++;
}else{
s_to = s_high--;
}
omega_out[s_to] = omega_tmp[s_from];
++ss;
}
std::cout << "Resulting omega_i:" << std::endl;
for(int i=0;i<Ls_out;i++)
std::cout << omega_out[i] << std::endl;
std::cout << "Test result matches the approximate polynomial found by the Remez" << std::endl;
std::cout << "<x> <remez approx> <poly approx> <diff poly approx remez approx> <exact> <diff poly approx exact>\n";
int npt = 60;
double dlt = lambda_bound/double(npt-1);
for (int i =0; i<npt; i++){
double x = i*dlt;
double r = remez.evaluateApprox(x);
double p = epsilonMobius(x, omega_out);
double e = epsilonMobius(x, omega_in);
std::cout << x<< " " << r << " " << p <<" " <<r-p << " " << e << " " << e-p << std::endl;
}
}
//mobius_param = b+c with b-c=1
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
std::vector<RealD> omega_in(Ls_in, 1./mobius_param);
computeZmobiusOmega(omega_out, Ls_out, omega_in, Ls_in, lambda_bound);
}
//ZMobius class takes gamma_i = (b+c) omega_i as its input, where b, c are factored out
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out,
const RealD mobius_param_out, const int Ls_out,
const RealD mobius_param_in, const int Ls_in,
const RealD lambda_bound){
computeZmobiusOmega(gamma_out, Ls_out, mobius_param_in, Ls_in, lambda_bound);
for(int i=0;i<Ls_out;i++) gamma_out[i] = gamma_out[i] * mobius_param_out;
}
//Assumes mobius_param_out == mobius_param_in
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
computeZmobiusGamma(gamma_out, mobius_param, Ls_out, mobius_param, Ls_in, lambda_bound);
}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);

View File

@ -0,0 +1,57 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/ZMobius.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ZMOBIUS_APPROX_H
#define GRID_ZMOBIUS_APPROX_H
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
//Compute the Zmobius Omega parameters suitable for eigenvalue range -lambda_bound <= lambda <= lambda_bound
//Note omega_i = 1/(b_i + c_i) where b_i and c_i are the Mobius parameters
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound);
//mobius_param = b+c with b-c=1
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
//ZMobius class takes gamma_i = (b+c) omega_i as its input, where b, c are factored out
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out,
const RealD mobius_param_out, const int Ls_out,
const RealD mobius_param_in, const int Ls_in,
const RealD lambda_bound);
//Assumes mobius_param_out == mobius_param_in
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@ -58,8 +58,8 @@
/* Compute the partial fraction expansion coefficients (alpha) from the
* factored form */
namespace Grid {
namespace Approx {
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
static void construct_partfrac(izd *z) {
int dn = z -> dn, dd = z -> dd, type = z -> type;
@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
* Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
* type = 1 for the approximation which is infinite at x = 0. */
zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
l, invlambda, xi, xisq, *tv, s, opl;
int m, czero, ts;
@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to PRECISION for external use only */
/* Converting everything to ZOLO_PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
}
zolotarev_data* higham(PRECISION epsilon, int n) {
zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
int m, czero;
zolotarev_data *zd;
@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -493,45 +493,47 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
return zd;
}
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#ifdef TEST
#undef ZERO
#define ZERO ((PRECISION) 0)
#define ZERO ((ZOLO_PRECISION) 0)
#undef ONE
#define ONE ((PRECISION) 1)
#define ONE ((ZOLO_PRECISION) 1)
#undef TWO
#define TWO ((PRECISION) 2)
#define TWO ((ZOLO_PRECISION) 2)
/* Evaluate the rational approximation R(x) using the factored form */
static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R;
ZOLO_PRECISION R;
if (rdata -> type == 0) {
R = rdata -> A * x;
@ -549,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
/* Evaluate the rational approximation R(x) using the partial fraction form */
static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> alpha[rdata -> da - 1];
ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
for (m = 0; m < rdata -> dd; m++)
R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@ -566,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
* non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
* but with signalling overflow you will get an error message. */
static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> beta[0] * x;
ZOLO_PRECISION R = rdata -> beta[0] * x;
for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
return R;
}
/* Evaluate the rational approximation R(x) using Cayley form */
static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION T;
ZOLO_PRECISION T;
T = rdata -> type == 0 ? ONE : -ONE;
for (m = 0; m < rdata -> n; m++)
@ -585,6 +587,7 @@ static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
return (ONE - T) / (ONE + T);
}
/* Test program. Apart from printing out the parameters for R(x) it produces
* the following data files for plotting (unless NPLOT is defined):
*
@ -604,7 +607,7 @@ int main(int argc, char** argv) {
int m, n, plotpts = 5000, type = 0;
float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
zolotarev_data *rdata;
PRECISION y;
ZOLO_PRECISION y;
FILE *plot_function, *plot_error,
*plot_partfrac, *plot_contfrac, *plot_cayley;
@ -623,13 +626,13 @@ int main(int argc, char** argv) {
}
rdata = type == 2
? higham((PRECISION) eps, n)
: zolotarev((PRECISION) eps, n, type);
? higham((ZOLO_PRECISION) eps, n)
: zolotarev((ZOLO_PRECISION) eps, n, type);
printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
"\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
"\tPRECISION = " STRINGIFY(PRECISION)
"\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
"\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
"\tDelta = %g (maximum error)\n\n"
"\tA = %g (overall factor)\n",
@ -678,15 +681,15 @@ int main(int argc, char** argv) {
x = 2.4 * (float) m / plotpts - 1.2;
if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
/* skip x = 0 for type 1, as R(0) is singular */
y = zolotarev_eval((PRECISION) x, rdata);
y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
fprintf(plot_function, "%g %g\n", x, (float) y);
fprintf(plot_error, "%g %g\n",
x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
maxypferr = MAX(maxypferr, fabs(ypferr));
@ -723,5 +726,5 @@ int main(int argc, char** argv) {
return EXIT_SUCCESS;
}
#endif /* TEST */

View File

@ -1,18 +1,18 @@
/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
#ifdef __cplusplus
namespace Grid {
namespace Approx {
#include <Grid/Namespace.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
#endif
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION
#define PRECISION double
#ifndef ZOLO_PRECISION
#define ZOLO_PRECISION double
#endif
#define ZPRECISION PRECISION
#define ZPRECISION ZOLO_PRECISION
#define ZOLOTAREV_DATA zolotarev_data
#endif
@ -77,11 +77,13 @@ typedef struct {
* zolotarev_data structure. The arguments must satisfy the constraints that
* epsilon > 0, n > 0, and type = 0 or 1. */
ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
void zolotarev_free(zolotarev_data *zdata);
#endif
#ifdef __cplusplus
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@ -10,10 +10,12 @@
#ifndef INCLUDED_BIGFLOAT_H
#define INCLUDED_BIGFLOAT_H
#define __GMP_WITHIN_CONFIGURE
#include <gmp.h>
#include <mpf2mpfr.h>
#include <mpfr.h>
#undef __GMP_WITHIN_CONFIGURE
class bigfloat {
private:

View File

@ -25,6 +25,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef INCLUDED_BIGFLOAT_DOUBLE_H
#define INCLUDED_BIGFLOAT_DOUBLE_H
#include <math.h>
typedef double mfloat;
@ -186,4 +190,6 @@ public:
// friend bigfloat& random(void);
};
#endif

View File

@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_BEGIN(Grid);
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
NAMESPACE_END(Grid);

View File

@ -0,0 +1,895 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#ifdef GRID_HIP
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
#include <cublas_v2.h>
#endif
#ifdef GRID_SYCL
#include <oneapi/mkl.hpp>
#endif
#if 0
#define GRID_ONE_MKL
#endif
#ifdef GRID_ONE_MKL
#include <oneapi/mkl.hpp>
#endif
///////////////////////////////////////////////////////////////////////
// Need to rearrange lattice data to be in the right format for a
// batched multiply. Might as well make these static, dense packed
///////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP
typedef hipblasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_CUDA
typedef cublasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_SYCL
typedef sycl::queue *gridblasHandle_t;
#endif
#ifdef GRID_ONE_MKL
typedef sycl::queue *gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t;
#endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
class GridBLAS {
public:
static gridblasHandle_t gridblasHandle;
static int gridblasInit;
static void Init(void)
{
if ( ! gridblasInit ) {
#ifdef GRID_CUDA
std::cout << "cublasCreate"<<std::endl;
cublasCreate(&gridblasHandle);
cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
#endif
#ifdef GRID_HIP
std::cout << "hipblasCreate"<<std::endl;
hipblasCreate(&gridblasHandle);
#endif
#ifdef GRID_SYCL
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
sycl::gpu_selector selector;
sycl::device selectedDevice { selector };
sycl::property_list q_prop{sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif
gridblasInit=1;
}
}
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
/////////////////////////////////////////////////////////////////////////////////////
// BLAS GEMM conventions:
/////////////////////////////////////////////////////////////////////////////////////
// - C = alpha A * B + beta C
// Dimensions:
// - C_m.n
// - A_m.k
// - B_k.n
// - Flops = 8 M N K
// - Bytes = 2*sizeof(word) * (MN+MK+KN)
// M=60, N=12
// Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD
/////////////////////////////////////////////////////////////////////////////////////
void synchronise(void)
{
#ifdef GRID_HIP
auto err = hipDeviceSynchronize();
assert(err==hipSuccess);
#endif
#ifdef GRID_CUDA
auto err = cudaDeviceSynchronize();
assert(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
accelerator_barrier();
#endif
#ifdef GRID_ONE_MKL
gridblasHandle->wait();
#endif
}
void gemmBatched(int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
ComplexF alpha,
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexD> alpha_p(1);
static deviceVector<ComplexD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
RealD t0=usecond();
// std::cout << "ZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex **)&Amk[0], lda,
(hipblasDoubleComplex **)&Bkn[0], ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl;
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex **)&Amk[0], lda,
(cuDoubleComplex **)&Bkn[0], ldb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexD *) &alpha_p[0],
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
(const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexD *) &beta_p[0],
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#if 0
// This code was used to check the mat mul on Sunspot/OneMKL
std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
std::vector<ComplexD> A(m*k); // pointer list to matrices
std::vector<ComplexD> B(k*n);
std::vector<ComplexD> C(m*n);
// int sda = lda*k;
// int sdb = ldb*k;
// int sdc = ldc*n;
std::cerr << " Checking the GEMM results "<<std::endl;
for (int p = 0; p < 1; ++p) {
ComplexD * Amk_p; // pointer list to matrices
ComplexD * Bkn_p; // pointer list to matrices
ComplexD * Cmn_p; // pointer list to matrices
acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
std::cerr << " p " << p << " copied pointers "<<std::endl;
acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
std::cerr << " p " << p << " copied matrices "<<std::endl;
std::cerr << " C[0] "<<C[0]<<std::endl;
std::cerr << " A[0] "<<A[0]<<std::endl;
std::cerr << " B[0] "<<B[0]<<std::endl;
std::cerr << " m "<<m<<std::endl;
std::cerr << " n "<<n<<std::endl;
std::cerr << " k "<<k<<std::endl;
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk) {
int idx_a, idx_b;
// int lda = m; // m x k column major
// int ldb = k; // k x n column major
// int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N) {
idx_a =kk + mm*lda;
} else {
idx_a =mm + kk*lda;
}
if(OpB!=GridBLAS_OP_N) {
idx_b =nn + kk*ldb;
} else {
idx_b =kk + nn*ldb;
}
// std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
ComplexD Ac = A[idx_a];
ComplexD Bc = B[idx_b];
if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
c_mn += Ac*Bc;
}
std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
}
}
}
#endif
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
// std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
// std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
// std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexF alpha,
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexF> alpha_p(1);
static deviceVector<ComplexF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex **)&Amk[0], lda,
(hipblasComplex **)&Bkn[0], ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Single precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealF> alpha_p(1);
static deviceVector<RealF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(float *) &alpha_p[0],
(const float **)&Amk[0], (const int64_t *)&lda64,
(const float **)&Bkn[0], (const int64_t *)&ldb64,
(float *) &beta_p[0],
(float **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealD> alpha_p(1);
static deviceVector<RealD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasDgemmBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasDgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(const double **)&Amk[0], (const int64_t *)&lda64,
(const double **)&Bkn[0], (const int64_t *)&ldb64,
(double *) &beta_p[0],
(double **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
int32_t N_A = M*K*BATCH;
int32_t N_B = K*N*BATCH;
int32_t N_C = M*N*BATCH;
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
CComplex alpha(1.0);
CComplex beta (1.0);
RealD flops = 8.0*M*N*K*BATCH;
int ncall=1000;
deviceVector<CComplex *> As(BATCH);
deviceVector<CComplex *> Bs(BATCH);
deviceVector<CComplex *> Cs(BATCH);
for(int b = 0 ; b < BATCH;b++) {
CComplex *ptr;
ptr = &A[b*M*K]; acceleratorPut(As[b],ptr);
ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr);
ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr);
}
// Warm up call
gemmBatched(M,N,K,
alpha,
As, // m x k
Bs, // k x n
beta,
Cs);
synchronise();
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemmBatched(M,N,K,
alpha,
As, // m x k
Bs, // k x n
beta,
Cs);
synchronise();
}
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
flops = 8.0*M*N*K*BATCH*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
};
NAMESPACE_END(Grid);

View File

@ -33,12 +33,19 @@ namespace Grid {
template<class Field>
class ZeroGuesser: public LinearFunction<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = zero; };
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
};
template<class Field>
class DoNothingGuesser: public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { };
};
template<class Field>
class SourceGuesser: public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = src; };
};
@ -50,20 +57,29 @@ class DeflatedGuesser: public LinearFunction<Field> {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
const unsigned int N;
public:
using LinearFunction<Field>::operator();
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
: DeflatedGuesser(_evec, _eval, _evec.size())
{}
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N)
{
assert(evec.size()==eval.size());
assert(N <= evec.size());
}
virtual void operator()(const Field &src,Field &guess) {
guess = zero;
assert(evec.size()==eval.size());
auto N = evec.size();
guess = Zero();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
guess.checkerboard = src.checkerboard;
guess.Checkerboard() = src.Checkerboard();
}
};
@ -75,6 +91,7 @@ private:
const std::vector<RealD> &eval_coarse;
public:
using LinearFunction<FineField>::operator();
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
@ -86,17 +103,53 @@ public:
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
CoarseField src_coarse(evec_coarse[0].Grid());
CoarseField guess_coarse(evec_coarse[0].Grid()); guess_coarse = Zero();
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
guess.checkerboard = src.checkerboard;
guess.Checkerboard() = src.Checkerboard();
};
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
int Nevec = (int)evec_coarse.size();
int Nsrc = (int)src.size();
// make temp variables
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
//Preporcessing
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
guess_coarse[j] = Zero();
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockProject(src_coarse[j],src[j],subspace);
}
//deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
for (int i=0;i<Nevec;i++)
{
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
const CoarseField & tmp = evec_coarse[i];
for (int j=0;j<Nsrc;j++)
{
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
}
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
};
};

View File

@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,513 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSDeflation.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
MultiRHS block projection
Import basis -> nblock x nbasis x (block x internal)
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
=> coarse_(nrhs x nbasis )^block = via batched GEMM
//template<class vobj,class CComplex,int nbasis,class VLattice>
//inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
// const VLattice &fineData,
// const VLattice &Basis)
*/
template<class Field>
class MultiRHSBlockProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef Field Fermion;
int nbasis;
GridBase *coarse_grid;
GridBase *fine_grid;
uint64_t block_vol;
uint64_t fine_vol;
uint64_t coarse_vol;
uint64_t words;
// Row major layout "C" order:
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
/*
* in Fortran column major notation (cuBlas order)
*
* Vxb = [v1(x)][..][vn(x)] ... x coarse vol
*
* Fxr = [r1(x)][..][rm(x)] ... x coarse vol
*
* Block project:
* C_br = V^dag F x coarse vol
*
* Block promote:
* F_xr = Vxb Cbr x coarse_vol
*/
deviceVector<scalar> BLAS_V; // words * block_vol * nbasis x coarse_vol
deviceVector<scalar> BLAS_F; // nrhs x fine_vol * words -- the sources
deviceVector<scalar> BLAS_C; // nrhs x coarse_vol * nbasis -- the coarse coeffs
RealD blasNorm2(deviceVector<scalar> &blas)
{
scalar ss(0.0);
std::vector<scalar> tmp(blas.size());
acceleratorCopyFromDevice(&blas[0],&tmp[0],blas.size()*sizeof(scalar));
for(int64_t s=0;s<blas.size();s++){
ss=ss+tmp[s]*adj(tmp[s]);
}
coarse_grid->GlobalSum(ss);
return real(ss);
}
MultiRHSBlockProject(){};
~MultiRHSBlockProject(){ Deallocate(); };
void Deallocate(void)
{
nbasis=0;
coarse_grid=nullptr;
fine_grid=nullptr;
fine_vol=0;
block_vol=0;
coarse_vol=0;
words=0;
BLAS_V.resize(0);
BLAS_F.resize(0);
BLAS_C.resize(0);
}
void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid)
{
nbasis=_nbasis;
fine_grid=_fgrid;
coarse_grid=_cgrid;
fine_vol = fine_grid->lSites();
coarse_vol = coarse_grid->lSites();
block_vol = fine_vol/coarse_vol;
words = sizeof(scalar_object)/sizeof(scalar);
BLAS_V.resize (fine_vol * words * nbasis );
}
void ImportFineGridVectors(std::vector <Field > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename Field::vector_object vobj;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
assert(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
}
uint64_t sz = blas.size();
acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar));
Coordinate fine_rdimensions = fine_grid->_rdimensions;
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
int64_t bv= block_vol;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector importing vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
autoView( fineData , vecs[v], AcceleratorRead);
auto blasData_p = &blas[0];
auto fineData_p = &fineData[0];
int64_t osites = fine_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
// std::cout << "sz "<<sz<<std::endl;
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
uint64_t lwords= words; // local variable for copy in to GPU
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
// One thread per fine site
Coordinate coor_f(_ndimension);
Coordinate coor_b(_ndimension);
Coordinate coor_c(_ndimension);
// Fine site to fine coor
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
int sc;// coarse site
int sb;// block site
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
scalar_object data = extractLane(lane,fineData[sf]);
// BLAS layout address calculation
// words * block_vol * nbasis x coarse_vol
// coarse oSite x block vole x lanes
int64_t site = (lane*osites + sc*bv)*nvec
+ v*bv
+ sb;
// assert(site*lwords<sz);
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
*ptr = data;
#ifdef GRID_SIMT
}
#else
}
#endif
});
// std::cout << " import fine Blas norm "<<blasNorm2(blas)<<std::endl;
// std::cout << " BlockProjector imported vector"<<v<<std::endl;
}
}
void ExportFineGridVectors(std::vector <Field> &vecs, deviceVector<scalar> &blas)
{
typedef typename Field::vector_object vobj;
int nvec = vecs.size();
assert(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
}
Coordinate fine_rdimensions = fine_grid->_rdimensions;
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
// std::cout << " export fine Blas norm "<<blasNorm2(blas)<<std::endl;
int64_t bv= block_vol;
for(int v=0;v<vecs.size();v++){
autoView( fineData , vecs[v], AcceleratorWrite);
auto blasData_p = &blas[0];
auto fineData_p = &fineData[0];
int64_t osites = fine_grid->oSites();
uint64_t lwords = words;
// std::cout << " Nsimd is "<<vobj::Nsimd() << std::endl;
// std::cout << " lwords is "<<lwords << std::endl;
// std::cout << " sizeof(scalar_object) is "<<sizeof(scalar_object) << std::endl;
// loop over fine sites
accelerator_for(sf,osites,vobj::Nsimd(),{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(vobj::Nsimd()); // buffer lane
#else
for(int lane=0;lane<vobj::Nsimd();lane++) {
#endif
// One thread per fine site
Coordinate coor_f(_ndimension);
Coordinate coor_b(_ndimension);
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
int sc;
int sb;
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
// BLAS layout address calculation
// words * block_vol * nbasis x coarse_vol
int64_t site = (lane*osites + sc*bv)*nvec
+ v*bv
+ sb;
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
scalar_object data = *ptr;
insertLane(lane,fineData[sf],data);
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
}
template<class vobj>
void ImportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector importing coarse vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
autoView( coarseData , vecs[v], AcceleratorRead);
auto blasData_p = &blas[0];
auto coarseData_p = &coarseData[0];
int64_t osites = coarse_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
// C_br per site
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
coarse_scalar_object data = extractLane(lane,coarseData[sc]);
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
*ptr = data;
#ifdef GRID_SIMT
}
#else
}
#endif
});
// std::cout << " import coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
}
}
template<class vobj>
void ExportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
// std::cout << " export coarsee Blas norm "<<blasNorm2(blas)<<std::endl;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector exporting coarse vector"<<v<<std::endl;
autoView( coarseData , vecs[v], AcceleratorWrite);
auto blasData_p = &blas[0];
auto coarseData_p = &coarseData[0];
int64_t osites = coarse_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
// Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
coarse_scalar_object data = *ptr;
insertLane(lane,coarseData[sc],data);
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
}
void ImportBasis(std::vector < Field > &vecs)
{
// std::cout << " BlockProjector Import basis size "<<vecs.size()<<std::endl;
ImportFineGridVectors(vecs,BLAS_V);
}
template<class cobj>
void blockProject(std::vector<Field> &fine,std::vector< Lattice<cobj> > & coarse)
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
assert(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
/////////////////////////////////////////////
// Copy in the multi-rhs sources to same data layout
/////////////////////////////////////////////
// std::cout << "BlockProject import fine"<<std::endl;
ImportFineGridVectors(fine,BLAS_F);
deviceVector<scalar *> Vd(coarse_vol);
deviceVector<scalar *> Fd(coarse_vol);
deviceVector<scalar *> Cd(coarse_vol);
// std::cout << "BlockProject pointers"<<std::endl;
for(int c=0;c<coarse_vol;c++){
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
acceleratorPut(Vd[c],Vh);
acceleratorPut(Fd[c],Fh);
acceleratorPut(Cd[c],Ch);
}
GridBLAS BLAS;
// std::cout << "BlockProject BLAS"<<std::endl;
int64_t vw = block_vol * words;
/////////////////////////////////////////
// C_br = V^dag R
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw,
scalar(1.0),
Vd,
Fd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl;
ExportCoarseGridVectors(coarse, BLAS_C);
// std::cout << "BlockProject done"<<std::endl;
}
template<class cobj>
void blockPromote(std::vector<Field> &fine,std::vector<Lattice<cobj> > & coarse)
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
assert(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
ImportCoarseGridVectors(coarse, BLAS_C);
GridBLAS BLAS;
deviceVector<scalar *> Vd(coarse_vol);
deviceVector<scalar *> Fd(coarse_vol);
deviceVector<scalar *> Cd(coarse_vol);
for(int c=0;c<coarse_vol;c++){
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
acceleratorPut(Vd[c],Vh);
acceleratorPut(Fd[c],Fh);
acceleratorPut(Cd[c],Ch);
}
/////////////////////////////////////////
// Block promote:
// F_xr = Vxb Cbr (x coarse_vol)
/////////////////////////////////////////
int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nbasis,
scalar(1.0),
Vd,
Cd,
scalar(0.0), // wipe out C
Fd);
BLAS.synchronise();
// std::cout << " blas call done"<<std::endl;
ExportFineGridVectors(fine, BLAS_F);
// std::cout << " exported "<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,233 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSDeflation.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs projection
i) MultiRHS Deflation
Import Evecs -> nev x vol x internal
Import vector of Lattice objects -> nrhs x vol x internal
=> Cij (nrhs x Nev) via GEMM.
=> Guess (nrhs x vol x internal) = C x evecs (via GEMM)
Export
ii) MultiRHS block projection
Import basis -> nblock x nbasis x (block x internal)
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
=> coarse_(nrhs x nbasis )^block = via batched GEMM
iii) Alternate interface:
Import higher dim Lattice object-> vol x nrhs layout
*/
template<class Field>
class MultiRHSDeflation
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
int nev;
std::vector<RealD> eval;
GridBase *grid;
uint64_t vol;
uint64_t words;
deviceVector<scalar> BLAS_E; // nev x vol -- the eigenbasis (up to a 1/sqrt(lambda))
deviceVector<scalar> BLAS_R; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_G; // nrhs x vol -- the guess
deviceVector<scalar> BLAS_C; // nrhs x nev -- the coefficients
MultiRHSDeflation(){};
~MultiRHSDeflation(){ Deallocate(); };
void Deallocate(void)
{
nev=0;
grid=nullptr;
vol=0;
words=0;
BLAS_E.resize(0);
BLAS_R.resize(0);
BLAS_C.resize(0);
BLAS_G.resize(0);
}
void Allocate(int _nev,GridBase *_grid)
{
nev=_nev;
grid=_grid;
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
eval.resize(nev);
BLAS_E.resize (vol * words * nev );
std::cout << GridLogMessage << " Allocate for "<<nev<<" eigenvectors and volume "<<vol<<std::endl;
}
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{
// std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
assert(ev<eval.size());
eval[ev] = _eval;
int64_t offset = ev*vol*words;
autoView(v,evec,AcceleratorRead);
acceleratorCopyDeviceToDevice(&v[0],&BLAS_E[offset],sizeof(scalar_object)*vol);
}
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval)
{
ImportEigenBasis(evec,_eval,0,evec.size());
}
// Could use to import a batch of eigenvectors
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
{
assert(_ev0+_nev<=evec.size());
Allocate(_nev,evec[0].Grid());
// Imports a sub-batch of eigenvectors, _ev0, ..., _ev0+_nev-1
for(int e=0;e<nev;e++){
std::cout << "Importing eigenvector "<<e<<" evalue "<<_eval[_ev0+e]<<std::endl;
ImportEigenVector(evec[_ev0+e],_eval[_ev0+e],e);
}
}
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
{
int nrhs = source.size();
assert(source.size()==guess.size());
assert(grid == guess[0].Grid());
conformable(guess[0],source[0]);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_R.resize(nrhs * vw); // cost free if size doesn't change
BLAS_G.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nev * nrhs);// cost free if size doesn't change
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
// for(int r=0;r<nrhs;r++){
// std::cout << " source["<<r<<"] = "<<norm2(source[r])<<std::endl;
// }
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(v,source[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&v[0],&BLAS_R[offset],sizeof(scalar_object)*vol);
}
/*
* in Fortran column major notation (cuBlas order)
*
* Exe = [e1(x)][..][en(x)]
*
* Rxr = [r1(x)][..][rm(x)]
*
* C_er = E^dag R
* C_er = C_er / lambda_e
* G_xr = Exe Cer
*/
deviceVector<scalar *> Ed(1);
deviceVector<scalar *> Rd(1);
deviceVector<scalar *> Cd(1);
deviceVector<scalar *> Gd(1);
scalar * Eh = & BLAS_E[0];
scalar * Rh = & BLAS_R[0];
scalar * Ch = & BLAS_C[0];
scalar * Gh = & BLAS_G[0];
acceleratorPut(Ed[0],Eh);
acceleratorPut(Rd[0],Rh);
acceleratorPut(Cd[0],Ch);
acceleratorPut(Gd[0],Gh);
GridBLAS BLAS;
/////////////////////////////////////////
// C_er = E^dag R
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw,
scalar(1.0),
Ed,
Rd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
assert(BLAS_C.size()==nev*nrhs);
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nev*nrhs);
for(int e=0;e<nev;e++){
RealD lam(1.0/eval[e]);
for(int r=0;r<nrhs;r++){
int off = e+nev*r;
HOST_C[off]=HOST_C[off] * lam;
// std::cout << "C["<<e<<"]["<<r<<"] ="<<HOST_C[off]<< " eval[e] "<<eval[e] <<std::endl;
}
}
acceleratorCopyToDevice(&HOST_C[0],&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/////////////////////////////////////////
// Guess G_xr = Exe Cer
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev,
scalar(1.0),
Ed, // x . nev
Cd, // nev . nrhs
scalar(0.0),
Gd);
BLAS.synchronise();
///////////////////////////////////////
// Copy out the multirhs
///////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(v,guess[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_G[offset],&v[0],sizeof(scalar_object)*vol);
}
RealD t1 = usecond();
std::cout << GridLogMessage << "MultiRHSDeflation for "<<nrhs<<" sources with "<<nev<<" eigenvectors took " << (t1-t0)/1e3 <<" ms"<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@ -33,109 +33,111 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Deflation methods considered
* -- Solve P A x = P b [ like Luscher ]
* DEF-1 M P A x = M P b [i.e. left precon]
* DEF-2 P^T M A x = P^T M b
* ADEF-1 Preconditioner = M P + Q [ Q + M + M A Q]
* ADEF-2 Preconditioner = P^T M + Q
* BNN Preconditioner = P^T M P + Q
* BNN2 Preconditioner = M P + P^TM +Q - M P A M
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
* Vout = x
*/
NAMESPACE_BEGIN(Grid);
// abstract base
template<class Field, class CoarseField>
class TwoLevelFlexiblePcg : public LinearFunction<Field>
template<class Field>
class TwoLevelCG : public LinearFunction<Field>
{
public:
int verbose;
RealD Tolerance;
Integer MaxIterations;
const int mmax = 5;
GridBase *grid;
GridBase *coarsegrid;
LinearOperatorBase<Field> *_Linop
OperatorFunction<Field> *_Smoother,
LinearFunction<CoarseField> *_CoarseSolver;
// Need somthing that knows how to get from Coarse to fine and back again
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
// more most opertor functions
TwoLevelFlexiblePcg(RealD tol,
Integer maxit,
LinearOperatorBase<Field> *Linop,
LinearOperatorBase<Field> *SmootherLinop,
OperatorFunction<Field> *Smoother,
OperatorFunction<CoarseField> CoarseLinop
) :
TwoLevelCG(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
Tolerance(tol),
MaxIterations(maxit),
_Linop(Linop),
_PreconditionerLinop(PrecLinop),
_Preconditioner(Preconditioner)
{
verbose=0;
_FineLinop(FineLinop),
_Smoother(Smoother)
{
grid = fine;
};
// The Pcg routine is common to all, but the various matrices differ from derived
// implementation to derived implmentation
void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.checkerboard = src.checkerboard;
grid = src._grid;
virtual void operator() (const Field &src, Field &x)
{
std::cout << GridLogMessage<<"HDCG: fPcg starting single RHS"<<std::endl;
RealD f;
RealD rtzp,rtz,a,d,b;
RealD rptzp;
RealD tn;
RealD guess = norm2(psi);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
/////////////////////////////
// Set up history vectors
/////////////////////////////
std::vector<Field> p (mmax,grid);
int mmax = 5;
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
std::vector<Field> p(mmax,grid);
std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax);
Field x (grid); x = psi;
Field z (grid);
Field z(grid);
Field tmp(grid);
Field r (grid);
Field mu (grid);
Field mp (grid);
Field r (grid);
Field mu (grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated"<<std::endl;
//Initial residual computation & set up
RealD guess = norm2(x);
std::cout << GridLogMessage<<"HDCG: fPcg guess nrm "<<guess<<std::endl;
RealD src_nrm = norm2(src);
std::cout << GridLogMessage<<"HDCG: fPcg src nrm "<<src_nrm<<std::endl;
if ( src_nrm == 0.0 ) {
std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
x=Zero();
}
RealD tn;
GridStopWatch HDCGTimer;
HDCGTimer.Start();
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
x=src;
Vstart(x,src);
// r0 = b -A x0
HermOp(x,mmp); // Shouldn't this be something else?
_FineLinop.HermOp(x,mmp[0]);
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
{
double n1 = norm2(x);
double n2 = norm2(mmp[0]);
double n3 = norm2(r);
std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
M1(r,z,tmp,mp,SmootherMirs);
PcgM1(r,z);
rtzp =real(innerProduct(r,z));
///////////////////////////////////////
// Solve for Mss mu = P A z and set p = z-mu
// Def2: p = 1 - Q Az = Pright z
// Def2 p = 1 - Q Az = Pright z
// Other algos M2 is trivial
///////////////////////////////////////
M2(z,p[0]);
PcgM2(z,p[0]);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
Field pp(grid);
for (int k=0;k<=MaxIterations;k++){
@ -143,31 +145,46 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
int peri_kp = (k+1) % mmax;
rtz=rtzp;
d= M3(p[peri_k],mp,mmp[peri_k],tmp);
d= PcgM3(p[peri_k],mmp[peri_k]);
a = rtz/d;
// Memorise this
pAp[peri_k] = d;
axpy(x,a,p[peri_k],x);
RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
// Compute z = M x
M1(r,z,tmp,mp);
PcgM1(r,z);
{
RealD n1,n2;
n1=norm2(r);
n2=norm2(z);
std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
}
rtzp =real(innerProduct(r,z));
std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
// PcgM2(z,p[0]);
PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
p[peri_kp]=mu;
p[peri_kp]=p[peri_k];
// Standard search direction p -> z + b p ; b =
// Standard search direction p -> z + b p
b = (rtzp)/rtz;
int northog;
// k=zero <=> peri_kp=1; northog = 1
// k=1 <=> peri_kp=2; northog = 2
// ... ... ...
// k=mmax-2<=> peri_kp=mmax-1; northog = mmax-1
// k=mmax-1<=> peri_kp=0; northog = 1
// northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
@ -176,75 +193,324 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
}
RealD rrn=sqrt(rn/ssq);
std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
RealD rtn=sqrt(rtz/ssq);
RealD rtnp=sqrt(rtzp/ssq);
std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";
// Stopping condition
if ( rn <= rsq ) {
HermOp(x,mmp); // Shouldn't this be something else?
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
_FineLinop.HermOp(x,mmp[0]);
axpy(tmp,-1.0,src,mmp[0]);
RealD psinorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
return k;
RealD mmpnorm = sqrt(norm2(mmp[0]));
RealD xnorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
return;
}
}
// Non-convergence
assert(0);
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
RealD xnorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated p"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated mmp"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::cout << GridLogMessage<<"HDCG: fPcg allocated pAp"<<std::endl;
src[0].Grid()->Barrier();
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated z,mp,r,mu"<<std::endl;
src[0].Grid()->Barrier();
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
HDCGTimer.Start();
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
a[rhs] = rtz[rhs]/d[rhs];
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
}
// Compute z = M x (for *all* RHS)
PcgM1(r,z);
std::cout << GridLogMessage<<"HDCG::fPcg M1 complete"<<std::endl;
grid->Barrier();
RealD max_rn=0.0;
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG: rhs "<<rhs<<"fPcg k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
}
public:
virtual void M(Field & in,Field & out,Field & tmp) {
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out)
{
std::cout << "PcgM1 default (cheat) mrhs version"<<std::endl;
for(int rhs=0;rhs<in.size();rhs++){
this->PcgM1(in[rhs],out[rhs]);
}
}
virtual void PcgM1(Field & in, Field & out) =0;
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
std::cout << "Vstart default (cheat) mrhs version"<<std::endl;
for(int rhs=0;rhs<x.size();rhs++){
this->Vstart(x[rhs],src[rhs]);
}
}
virtual void Vstart(Field & x,const Field & src)=0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
}
virtual void M1(Field & in, Field & out) {// the smoother
virtual RealD PcgM3(const Field & p, Field & mmp){
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public TwoLevelCG<Field>
{
public:
///////////////////////////////////////////////////////////////////////////////////
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
///////////////////////////////////////////////////////////////////////////////////
GridBase *coarsegrid;
Aggregation &_Aggregates;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
///////////////////////////////////////////////////////////////////////////////////
// more most opertor functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
{
coarsegrid = Aggregates.CoarseGrid;
};
virtual void PcgM1(Field & in, Field & out)
{
GRID_TRACE("MultiGridPreconditioner ");
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
PcgM(in,Min); // Smoother call
Field tmp(this->grid);
Field Min(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
HermOp(Min,out);
GridStopWatch SmootherTimer;
GridStopWatch MatrixTimer;
SmootherTimer.Start();
this->_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
this->_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min
ProjectToSubspace(tmp,PleftProj);
ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
GridStopWatch ProjTimer;
GridStopWatch CoarseTimer;
GridStopWatch PromTimer;
ProjTimer.Start();
this->_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp
}
virtual void M2(const Field & in, Field & out) {
out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
}
virtual RealD M3(const Field & p, Field & mmp){
double d,dd;
HermOpAndNorm(p,mmp,d,dd);
return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
}
virtual void VstartDef2(Field & xconst Field & src){
//case PcgDef2:
//case PcgAdef2:
//case PcgAdef2f:
//case PcgV11f:
virtual void Vstart(Field & x,const Field & src)
{
std::cout << GridLogMessage<<"HDCG: fPcg Vstart "<<std::endl;
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@ -256,142 +522,78 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
Field r(grid);
Field mmp(grid);
HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
ProjectToSubspace(r,PleftProj);
ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp;
Field r(this->grid);
Field mmp(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
std::cout << GridLogMessage<<"HDCG: fPcg Vstart projecting "<<std::endl;
this->_Aggregates.ProjectToSubspace(PleftProj,src);
std::cout << GridLogMessage<<"HDCG: fPcg Vstart coarse solve "<<std::endl;
this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);
}
};
template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field>
{
public:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
TwoLevelADEF1defl(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
std::vector<Field> &_evec,
std::vector<RealD> &_eval) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
evec(_evec),
eval(_eval)
{};
// Can just inherit existing M2
// Can just inherit existing M3
// Simple vstart - do nothing
virtual void Vstart(Field & x,const Field & src){
return;
x=src; // Could apply Q
};
// Override PcgM1
virtual void PcgM1(Field & in, Field & out)
{
GRID_TRACE("EvecPreconditioner ");
int N=evec.size();
Field Pin(this->grid);
Field Qin(this->grid);
//MP + Q = M(1-AQ) + Q = M
// // If we are eigenvector deflating in coarse space
// // Q = Sum_i |phi_i> 1/lambda_i <phi_i|
// // A Q = Sum_i |phi_i> <phi_i|
// // M(1-AQ) = M(1-proj) + Q
Qin.Checkerboard()=in.Checkerboard();
Qin = Zero();
Pin = in;
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
auto ip = TensorRemove(innerProduct(tmp,in));
axpy(Qin, ip / eval[i],tmp,Qin);
axpy(Pin, -ip ,tmp,Pin);
}
this->_Smoother(Pin,out);
out = out + Qin;
}
};
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
}
NAMESPACE_END(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out){
// P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ]
Field in_sbar(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
}
virtual void Pleft (Field & in,Field & out){
// P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ]
Field in_sbar(grid);
Field tmp2(grid);
Field Mtmp(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out);
HermOp(out,Mtmp);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
}
}
template<class Field>
class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp){
}
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
}
virtual void M2(Field & in, Field & out){
}
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
}
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif

View File

@ -0,0 +1,734 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/AdefGeneric.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
/*
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
*/
NAMESPACE_BEGIN(Grid);
template<class Field>
class TwoLevelCGmrhs
{
public:
RealD Tolerance;
Integer MaxIterations;
GridBase *grid;
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer;
GridStopWatch DeflateTimer;
GridStopWatch CoarseTimer;
GridStopWatch FineTimer;
GridStopWatch SmoothTimer;
GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions
TwoLevelCGmrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
Tolerance(tol),
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{
grid = fine;
};
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_zz,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
std::vector<Field> & MQ,
const std::vector<Field> & Z,
const std::vector<Field> & MZ)
{
RealD t0=usecond();
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
RealD t1=usecond();
m_zz = 0.5*(m_zz+m_zz.adjoint());
Eigen::MatrixXcd L = m_zz.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
RealD t3=usecond();
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
RealD t4=usecond();
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
assert(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
// std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
M3Timer.Start();
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
M3Timer.Stop();
a[rhs] = rtz[rhs]/d[rhs];
LinalgTimer.Start();
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
LinalgTimer.Stop();
}
// Compute z = M x (for *all* RHS)
M1Timer.Start();
PcgM1(r,z);
M1Timer.Stop();
RealD max_rn=0.0;
LinalgTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
// std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG:fPcg rhs "<<rhs<<" k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
LinalgTimer.Stop();
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : fine M3 "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
}
public:
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out) = 0;
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src) = 0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
}
virtual RealD PcgM3(const Field & p, Field & mmp){
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
};
template<class Field, class CoarseField>
class TwoLevelADEF2mrhs : public TwoLevelCGmrhs<Field>
{
public:
GridBase *coarsegrid;
GridBase *coarsegridmrhs;
LinearFunction<CoarseField> &_CoarseSolverMrhs;
LinearFunction<CoarseField> &_CoarseSolverPreciseMrhs;
MultiRHSBlockProject<Field> &_Projector;
MultiRHSDeflation<CoarseField> &_Deflator;
TwoLevelADEF2mrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolverMrhs,
LinearFunction<CoarseField> &CoarseSolverPreciseMrhs,
MultiRHSBlockProject<Field> &Projector,
MultiRHSDeflation<CoarseField> &Deflator,
GridBase *_coarsemrhsgrid) :
TwoLevelCGmrhs<Field>(tol, maxit,FineLinop,Smoother,Projector.fine_grid),
_CoarseSolverMrhs(CoarseSolverMrhs),
_CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs),
_Projector(Projector),
_Deflator(Deflator)
{
coarsegrid = Projector.coarse_grid;
coarsegridmrhs = _coarsemrhsgrid;// Thi could be in projector
};
// Override Vstart
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
int nrhs=x.size();
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
// = [1 - Ass_inv A] Guess + Assinv src
// = P^T guess + Assinv src
// = Vstart [Tang notation]
// This gives:
// W^T (src - A x_0) = src_s - A guess_s - r_s
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
this->_Projector.blockProject(src,PleftProj);
this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
for(int rhs=0;rhs<nrhs;rhs++) {
InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
}
this->_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
}
this->_Projector.blockPromote(x,PleftMss_proj);
}
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
int nrhs=in.size();
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
std::vector<Field> tmp(nrhs,this->grid);
std::vector<Field> Min(nrhs,this->grid);
std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
// this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
this->_Smoother(in,Min);
this->SmoothTimer.Stop();
#else
for(int rhs=0;rhs<nrhs;rhs++) {
this->SmoothTimer.Start();
this->_Smoother(in[rhs],Min[rhs]);
this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
this->FineTimer.Start();
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
this->FineTimer.Stop();
}
this->ProjectTimer.Start();
this->_Projector.blockProject(tmp,PleftProj);
this->ProjectTimer.Stop();
this->DeflateTimer.Start();
this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
this->DeflateTimer.Stop();
this->InsertTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++) {
InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
}
this->InsertTimer.Stop();
this->CoarseTimer.Start();
this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
this->CoarseTimer.Stop();
this->InsertTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
}
this->InsertTimer.Stop();
this->PromoteTimer.Start();
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop();
this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
}
// this->zzz=out[0];
this->FineTimer.Stop();
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,234 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BiCGSTAB.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: juettner <juettner@soton.ac.uk>
Author: David Murphy <djmurphy@mit.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BICGSTAB_H
#define GRID_BICGSTAB_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template <class Field>
class BiCGSTAB : public OperatorFunction<Field>
{
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
BiCGSTAB(RealD tol, Integer maxit, bool err_on_no_conv = true) :
Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field>& Linop, const Field& src, Field& psi)
{
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD cp(0), rho(1), rho_prev(0), alpha(1), beta(0), omega(1);
RealD a(0), bo(0), b(0), ssq(0);
Field p(src);
Field r(src);
Field rhat(src);
Field v(src);
Field s(src);
Field t(src);
Field h(src);
v = Zero();
p = Zero();
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.Op(psi, v);
b = norm2(v);
r = src - v;
rhat = r;
a = norm2(r);
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: mp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: r " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if(a <= rsq){ return; }
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: k=0 residual " << a << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
GridStopWatch LinearCombTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++)
{
rho_prev = rho;
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Crho = innerProduct(rhat,r);
InnerTimer.Stop();
rho = Crho.real();
beta = (rho / rho_prev) * (alpha / omega);
LinearCombTimer.Start();
bo = beta * omega;
{
autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
MatrixTimer.Start();
Linop.Op(p,v);
MatrixTimer.Stop();
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Calpha = innerProduct(rhat,v);
InnerTimer.Stop();
alpha = rho / Calpha.real();
LinearCombTimer.Start();
{
autoView( p_v , p, AcceleratorRead);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
autoView( psi_v,psi, AcceleratorRead);
autoView( h_v , h, AcceleratorWrite);
autoView( s_v , s, AcceleratorWrite);
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
});
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
MatrixTimer.Start();
Linop.Op(s,t);
MatrixTimer.Stop();
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Comega = innerProduct(t,s);
InnerTimer.Stop();
omega = Comega.real() / norm2(t);
LinearCombTimer.Start();
{
autoView( psi_v,psi, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
autoView( h_v , h, AcceleratorRead);
autoView( s_v , s, AcceleratorRead);
autoView( t_v , t, AcceleratorRead);
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
}
LinearCombTimer.Stop();
cp = norm2(r);
LinalgTimer.Stop();
std::cout << GridLogIterative << "BiCGSTAB: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
// Stopping condition
if(cp <= rsq)
{
SolverTimer.Stop();
Linop.Op(psi, v);
p = v - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "BiCGSTAB Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp/ssq) << std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual << std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown " << std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
if(ErrorOnNoConverge){ assert(0); }
IterationsToComplete = k;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,159 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BiCGSTABMixedPrec.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
Author: David Murphy <djmurphy@mit.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BICGSTAB_MIXED_PREC_H
#define GRID_BICGSTAB_MIXED_PREC_H
NAMESPACE_BEGIN(Grid);
// Mixed precision restarted defect correction BiCGSTAB
template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
{
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; // Grid for single-precision fields
RealD OuterLoopNormMult; // Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionBiCGSTAB(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid,
LinearOperatorBase<FieldF>& _Linop_f, LinearOperatorBase<FieldD>& _Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d), Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), OuterLoopNormMult(100.), guesser(NULL) {};
void useGuesser(LinearFunction<FieldF>& g){
guesser = &g;
}
void operator() (const FieldD& src_d_in, FieldD& sol_d)
{
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.Checkerboard();
sol_d.Checkerboard() = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid);
src_f.Checkerboard() = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb;
BiCGSTAB<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++)
{
// Compute double precision rsd and also new RHS vector.
Linop_d.Op(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration " << outer_iter << " residual " << norm << " target " << stop << std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration converged on iteration " << outer_iter << std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop){ inner_tol *= 2; } // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
sol_f = Zero();
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL){ (*guesser)(src_f, sol_f); }
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
//Final trial CG
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Starting final patch-up double-precision solve" << std::endl;
BiCGSTAB<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TotalTimer.Stop();
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -27,11 +27,61 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
typedef typename Field::scalar_type scomplex;
int Nblock = X.size();
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
//
//Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
template<class Field>
double normv(const std::vector<Field> &P){
int Nblock = P.size();
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
@ -54,6 +104,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer PrintInterval; //GridLogMessages or Iterative
RealD TrueResidual;
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
@ -88,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -111,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
const std::vector<Field> & R)
{
InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
@ -154,12 +223,12 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog];
Nblock = B.Grid()->_fdimensions[Orthog];
/* FAKE */
Nblock=8;
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard;
X.Checkerboard() = B.Checkerboard();
conformable(X, B);
Field tmp(B);
@ -187,6 +256,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@ -222,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q;
@ -237,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer;
SolverTimer.Start();
RealD max_resid=0;
int k;
for (k = 1; k <= MaxIterations; k++) {
@ -281,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
max_resid=0;
RealD rrsum=0;
RealD rr;
@ -308,7 +383,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
AD = AD-B;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
TrueResidual = std::sqrt(norm2(AD)/norm2(B));
std::cout << GridLogMessage <<"\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@ -322,7 +398,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
@ -334,11 +412,11 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog];
Nblock = Src.Grid()->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
Psi.Checkerboard() = Src.Checkerboard();
conformable(Psi, Src);
Field P(Src);
@ -444,7 +522,8 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
TrueResidual = std::sqrt(norm2(AP)/norm2(Src));
std::cout <<GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@ -465,43 +544,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
IterationsToComplete = k;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + (scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = zero;
for(int bp=0;bp<Nblock;bp++) {
AP[b] += (m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation:
//--------------------------
@ -517,7 +559,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
for(int b=0;b<Nblock;b++){
X[b].checkerboard = B[b].checkerboard;
X[b].Checkerboard() = B[b].Checkerboard();
conformable(X[b], B[b]);
conformable(X[b], X[0]);
}
@ -548,6 +590,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@ -584,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
}
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
@ -655,7 +699,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
@ -670,7 +714,8 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
TrueResidual = std::sqrt(normv(AD)/normv(B));
std::cout << GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@ -690,9 +735,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
IterationsToComplete = k;
}
};
}
#endif
NAMESPACE_END(Grid);

View File

@ -0,0 +1,248 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the CAGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
CommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: src " << ssq << std::endl;
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
ComplexD scale = 1.0/gamma[0];
v[0] = scale * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(v, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
MatrixTimer.Start();
LinOp.Op(v[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + v[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@ -27,56 +27,95 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
RealD TrueResidual;
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv)
{};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
virtual void LogIteration(int k,RealD a,RealD b){
// std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
};
virtual void LogBegin(void){
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
this->LogBegin();
GRID_TRACE("ConjugateGradient");
GridStopWatch PreambleTimer;
GridStopWatch ConstructTimer;
GridStopWatch NormTimer;
GridStopWatch AssignTimer;
PreambleTimer.Start();
psi.Checkerboard() = src.Checkerboard();
psi.checkerboard = src.checkerboard;
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred;
Field p(src);
Field mmp(src);
Field r(src);
// Was doing copies
ConstructTimer.Start();
Field p (src.Grid());
Field mmp(src.Grid());
Field r (src.Grid());
ConstructTimer.Stop();
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
NormTimer.Start();
ssq = norm2(src);
RealD guess = norm2(psi);
NormTimer.Stop();
assert(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) {
r = src;
p = r;
a = ssq;
} else {
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
}
cp = a;
AssignTimer.Stop();
// Handle trivial case of zero src
if (ssq == 0.){
psi = Zero();
IterationsToComplete = 1;
TrueResidual = 0.;
return;
}
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
@ -89,12 +128,16 @@ class ConjugateGradient : public OperatorFunction<Field> {
// Check if guess is really REALLY good :)
if (cp <= rsq) {
TrueResidual = std::sqrt(a/ssq);
std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
IterationsToComplete = 0;
return;
}
std::cout << GridLogIterative << std::setprecision(8)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
PreambleTimer.Stop();
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
@ -102,9 +145,13 @@ class ConjugateGradient : public OperatorFunction<Field> {
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
RealD usecs = -usecond();
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations*1000; k++) {
for (k = 1; k <= MaxIterations; k++) {
GridStopWatch IterationTimer;
IterationTimer.Start();
c = cp;
MatrixTimer.Start();
@ -125,53 +172,202 @@ class ConjugateGradient : public OperatorFunction<Field> {
b = cp / c;
LinearCombTimer.Start();
parallel_for(int ss=0;ss<src._grid->oSites();ss++){
vstream(psi[ss], a * p[ss] + psi[ss]);
vstream(p [ss], b * p[ss] + r[ss]);
{
autoView( psi_v , psi, AcceleratorWrite);
autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
LogIteration(k,a,b);
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
IterationTimer.Stop();
if ( (k % 500) == 0 ) {
std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
} else {
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
}
// Stopping condition
if (cp <= rsq) {
usecs +=usecond();
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
GridBase *grid = src.Grid();
RealD DwfFlops = (1452. )*grid->gSites()*4*k
+ (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
// std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver Elapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
TrueResidual = true_residual;
return;
}
}
std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
<< std::endl;
// Failed. Calculate true residual before giving up
// Linop.HermOpAndNorm(psi, mmp, d, qq);
// p = mmp - src;
//TrueResidual = sqrt(norm2(p)/ssq);
// TrueResidual = 1;
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
SolverTimer.Stop();
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage<< "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
// Optionally record the CG polynomial
std::vector<double> ak;
std::vector<double> bk;
std::vector<double> poly_p;
std::vector<double> poly_r;
std::vector<double> poly_Ap;
std::vector<double> polynomial;
public:
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
{ };
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
Field tmp(src.Grid());
Field AtoN(src.Grid());
AtoN = src;
psi=AtoN*polynomial[0];
for(int n=1;n<polynomial.size();n++){
tmp = AtoN;
Linop.HermOp(tmp,AtoN);
psi = psi + polynomial[n]*AtoN;
}
}
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
{
Field Ap(src.Grid());
Field r(src.Grid());
Field p(src.Grid());
p=src;
r=src;
x=Zero();
x.Checkerboard()=src.Checkerboard();
for(int k=0;k<ak.size();k++){
x = x + ak[k]*p;
Linop.HermOp(p,Ap);
r = r - ak[k] * Ap;
p = r + bk[k] * p;
}
}
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
psi=Zero();
this->operator ()(Linop,src,psi);
}
virtual void LogBegin(void)
{
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
ak.resize(0);
bk.resize(0);
polynomial.resize(0);
poly_Ap.resize(0);
poly_Ap.resize(0);
poly_p.resize(1);
poly_r.resize(1);
poly_p[0]=1.0;
poly_r[0]=1.0;
};
virtual void LogIteration(int k,RealD a,RealD b)
{
// With zero guess,
// p = r = src
//
// iterate:
// x = x + a p
// r = r - a A p
// p = r + b p
//
// [0]
// r = x
// p = x
// Ap=0
//
// [1]
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
// x = x + a p ==> add polynomials term by term
// r = r - a A p ==> add polynomials term by term
// p = r + b p ==> add polynomials term by term
//
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
ak.push_back(a);
bk.push_back(b);
// Ap= right_shift(p)
poly_Ap.resize(k+1);
poly_Ap[0]=0.0;
for(int i=0;i<k;i++){
poly_Ap[i+1]=poly_p[i];
}
// x = x + a p
polynomial.resize(k);
polynomial[k-1]=0.0;
for(int i=0;i<k;i++){
polynomial[i] = polynomial[i] + a * poly_p[i];
}
// r = r - a Ap
// p = r + b p
poly_r.resize(k+1);
poly_p.resize(k+1);
poly_r[k] = poly_p[k] = 0.0;
for(int i=0;i<k+1;i++){
poly_r[i] = poly_r[i] - a * poly_Ap[i];
poly_p[i] = poly_r[i] + b * poly_p[i];
}
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,17 +23,20 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
@ -46,11 +49,17 @@ namespace Grid {
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
RealD TrueResidual;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
MixedPrecisionConjugateGradient(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL){ };
@ -59,96 +68,103 @@ namespace Grid {
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
TotalInnerIterations = 0;
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.checkerboard;
sol_d.checkerboard = cb;
int cb = src_d_in.Checkerboard();
sol_d.Checkerboard() = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid;
FieldD tmp_d(DoublePrecGrid);
tmp_d.checkerboard = cb;
GridBase* DoublePrecGrid = src_d_in.Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.checkerboard = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = InnerTolerance;
RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb;
FieldF src_f(SinglePrecGrid);
src_f.Checkerboard() = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.checkerboard = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
zeroit(sol_f);
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
sol_f = Zero();
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
};
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TrueResidual = CG_d.TrueResidual;
}
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,213 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -24,147 +24,169 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
public OperatorFunction<Field>
{
template<class Field>
class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
public OperatorFunction<Field>
{
public:
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
int verbose;
MultiShiftFunction shifts;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
verbose=1;
using OperatorFunction<Field>::operator();
// RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
GridBase *grid = src._grid;
int nshift = shifts.order;
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
return;
}
return;
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
GRID_TRACE("ConjugateGradientMultiShift");
GridBase *grid = src.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
GridBase *grid = src._grid;
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions
assert(psi.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// remove dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
Field r(grid);
Field p(grid);
Field tmp(grid);
Field mmp(grid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src);
assert(psi.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid^2 "<<rsq[s]<<std::endl;
ps[s] = src;
}
// r and p for primary
r=src;
p=src;
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
//MdagM+m[0]
Linop.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
const int primary =0;
// have verified that inner product of
// p and mmp is equal to d after this since
// the d computation is tricky
// qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
b = -cp /d;
// Matrix mult fields
Field r(grid);
Field p(grid);
Field tmp(grid);
Field mmp(grid);
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r,b,mmp,r);
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src);
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl;
ps[s] = src;
}
// r and p for primary
r=src;
p=src;
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
//MdagM+m[0]
Linop.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
// have verified that inner product of
// p and mmp is equal to d after this since
// the d computation is tricky
// qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r,b,mmp,r);
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
@ -175,37 +197,37 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
GridStopWatch SolverTimer;
SolverTimer.Start();
// Iteration loop
int k;
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
for (k=1;k<=MaxIterations;k++){
a = c /cp;
a = c /cp;
AXPYTimer.Start();
axpy(p,a,p,r);
axpy(p,a,p,r);
AXPYTimer.Stop();
// Note to self - direction ps is iterated seperately
// for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case.
//
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// Note to self - direction ps is iterated seperately
// for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case.
//
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps[s],a,ps[s],r);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps[s],a,ps[s],r);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]);
}
}
}
}
AXPYTimer.Stop();
cp=c;
cp=c;
MatrixTimer.Start();
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
// The below is faster on KNL
@ -215,108 +237,110 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp,mass[0],p,mmp);
axpy(mmp,mass[0],p,mmp);
AXPYTimer.Stop();
RealD rn = norm2(p);
d += rn*mass[0];
RealD rn = norm2(p);
d += rn*mass[0];
bp=b;
b=-cp/d;
bp=b;
b=-cp/d;
AXPYTimer.Start();
c=axpy_norm(r,b,mmp,r);
c=axpy_norm(r,b,mmp,r);
AXPYTimer.Stop();
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
}
ShiftTimer.Stop();
for(int s=0;s<nshift;s++){
int ss = s;
// Scope for optimisation here in case of "single".
// Could load psi[0] and pull all ps[s] in.
// if ( single ) ss=primary;
// Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain:
//
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
// If can predict the coefficient bs then we can fuse these and avoid write reread cyce
// on ps[s].
// Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
for(int s=0;s<nshift;s++){
int ss = s;
// Scope for optimisation here in case of "single".
// Could load psi[0] and pull all ps[s] in.
// if ( single ) ss=primary;
// Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain:
//
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
// If can predict the coefficient bs then we can fuse these and avoid write reread cyce
// on ps[s].
// Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
if( (!converged[s]) ) {
axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
}
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
if( (!converged[s]) ) {
axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
}
}
}
if ( all_converged ){
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop.HermOpAndNorm(psi[s],mmp,d,qq);
axpy(tmp,mass[s],psi[s],mmp);
axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop.HermOpAndNorm(psi[s],mmp,d,qq);
axpy(tmp,mass[s],psi[s],mmp);
axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<< TrueResidualShift[s] <<std::endl;
}
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
};
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,373 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChange(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChange(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChange(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChange(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,416 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//Linop to add shift to input linop, used in cleanup CG
namespace ConjugateGradientMultiShiftMixedPrecSupport{
template<typename Field>
class ShiftedLinop: public LinearOperatorBase<Field>{
public:
LinearOperatorBase<Field> &linop_base;
RealD shift;
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out);
axpy(out, shift, in, out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
};
};
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f, p_d, pc_wk_d_to_s);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
assert(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_d[s],a,ps_d[s],r_d);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
}
}
}
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
PrecChangeTimer.Stop();
AXPYTimer.Start();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
AXPYTimer.Stop();
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update double precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
}
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
if(k % ReliableUpdateFreq == 0){
RealD c_old = c;
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
c = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,234 +23,255 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
}
void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
psi.checkerboard = src.checkerboard;
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
return;
}
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.checkerboard = r.checkerboard;
precisionChange(r_f, r);
FieldF psi_f(r_f);
psi_f = zero;
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
psi_f = zero;
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{
assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
}
void operator()(const FieldD &src, FieldD &psi) {
GRID_TRACE("ConjugateGradientReliableUpdate");
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
return;
}
//Single prec initialization
precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r, pc_wk_dp_to_sp);
FieldF psi_f(r_f);
psi_f = Zero();
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
GridStopWatch PrecChangeTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
psi = psi + mmp;
MatrixTimer.Start();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
MatrixTimer.Stop();
r = src - mmp;
psi_f = Zero();
PrecChangeTimer.Start();
precisionChange(r_f, r, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -24,88 +24,90 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_RESIDUAL_H
#define GRID_CONJUGATE_RESIDUAL_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
template<class Field>
class ConjugateResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {
verbose=0;
};
RealD Tolerance;
Integer MaxIterations;
int verbose;
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {
verbose=0;
};
RealD a, b, c, d;
RealD cp, ssq,rsq;
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b; // c, d;
RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
GridBase *grid = src._grid;
psi=zero;
Field r(grid), p(grid), Ap(grid), Ar(grid);
GridBase *grid = src.Grid();
psi=Zero();
Field r(grid), p(grid), Ap(grid), Ar(grid);
r=src;
p=src;
r=src;
p=src;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=1;k<MaxIterations;k++){
a = rAr/pAAp;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,Ap,r);
rArp=rAr;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=1;k<MaxIterations;k++){
a = rAr/pAAp;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,Ap,r);
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
b =rAr/rArp;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<std::sqrt(cp/ssq)
<< " true residual "<<std::sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
}
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,258 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the FCAGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner;
FlexibleCommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit,
LinearFunction<Field> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
PrecTimer.Start();
Preconditioner(v[iter], z[iter]);
PrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@ -0,0 +1,256 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the FGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner;
FlexibleGeneralisedMinimalResidual(RealD tol,
Integer maxit,
LinearFunction<Field> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
PrecTimer.Start();
Preconditioner(v[iter], z[iter]);
PrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@ -0,0 +1,244 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/GeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the GMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
GeneralisedMinimalResidual(RealD tol,
Integer maxit,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: src " << ssq << std::endl;
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "GeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(v, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
MatrixTimer.Start();
LinOp.Op(v[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + v[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -35,120 +35,7 @@ Author: Christoph Lehner <clehner@bnl.gov>
//#include <zlib.h>
#include <sys/stat.h>
namespace Grid {
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
parallel_region
{
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis[k]._odata[ss];
}
}
for(int j=j0; j<j1; ++j){
basis[j]._odata[ss] = B[j];
}
}
}
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
result.checkerboard = basis[0].checkerboard;
parallel_for(int ss=0;ss < grid->oSites();ss++){
vobj B = zero;
for(int k=k0; k<k1; ++k){
B +=Qt(j,k) * basis[k]._odata[ss];
}
result._odata[ss] = B;
}
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
@ -192,14 +79,16 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public Imp
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<" target " << eresid*eresid << " conv " <<conv
<<std::endl;
return conv;
}
};
@ -259,7 +148,7 @@ public:
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
int _MinRestart=0, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@ -275,7 +164,7 @@ public:
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
int _MinRestart=0, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@ -289,7 +178,7 @@ public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
nn = std::sqrt(nn);
v = v * (1.0/nn);
return nn;
}
@ -321,10 +210,10 @@ until convergence
*/
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{
GridBase *grid = src._grid;
assert(grid == evec[0]._grid);
GridBase *grid = src.Grid();
assert(grid == evec[0].Grid());
GridLogIRL.TimingMode(1);
// GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@ -349,14 +238,17 @@ until convergence
{
auto src_n = src;
auto tmp = src;
std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl;
const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
normalise(src_n);
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05)
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@ -446,7 +338,7 @@ until convergence
assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl;
std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
////////////////////////////////////////////////////
// Compressed vector f and beta(k2)
@ -454,7 +346,7 @@ until convergence
f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2];
beta_k = norm2(f);
beta_k = sqrt(beta_k);
beta_k = std::sqrt(beta_k);
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k;
@ -477,7 +369,7 @@ until convergence
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
Field B(grid); B.checkerboard = evec[0].checkerboard;
Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
// power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1;
@ -515,7 +407,7 @@ until convergence
converged:
{
Field B(grid); B.checkerboard = evec[0].checkerboard;
Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0;
@ -529,14 +421,15 @@ until convergence
}
}
if ( Nconv < Nstop )
if ( Nconv < Nstop ) {
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
}
eval=eval2;
//Keep only converged
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
eval.resize(Nstop);// was Nconv
evec.resize(Nstop,grid);// was Nconv
basisSortInPlace(evec,eval,reverse);
}
@ -554,11 +447,11 @@ until convergence
/* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do:
3. wk:=Avkβkv_{k1}
4. αk:=(wk,vk) //
5. wk:=wkαkvk // wk orthog vk
6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
7. vk+1 := wk/βk+1
3. wk:=Avk - b_k v_{k-1}
4. ak:=(wk,vk) //
5. wk:=wk-akvk // wk orthog vk
6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop
7. vk+1 := wk/b_k+1
8. EndDo
*/
void step(std::vector<RealD>& lmd,
@ -566,6 +459,7 @@ until convergence
std::vector<Field>& evec,
Field& w,int Nm,int k)
{
std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20;
assert( k< Nm );
@ -573,31 +467,33 @@ until convergence
Field& evec_k = evec[k];
_PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
_PolyOp(evec_k,w); std::cout<<GridLogDebug << "PolyOp" <<std::endl;
if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
ComplexD zalph = innerProduct(evec_k,w);
RealD alph = real(zalph);
w = w - alph * evec_k;// 5. wk:=wkαkvk
w = w - alph * evec_k;
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
RealD beta = normalise(w);
lmd[k] = alph;
lme[k] = beta;
if (k>0 && k % orth_period == 0) {
if ( (k>0) && ( (k % orth_period) == 0 )) {
std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl;
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl;
}
if(k < Nm-1) evec[k+1] = w;
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl;
}
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
@ -807,7 +703,7 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift)
@ -838,5 +734,6 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
abort();
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -24,16 +24,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
struct LanczosParams : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
ChebyParams, Cheby,/*Chebyshev*/
int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
@ -45,8 +44,9 @@ struct LanczosParams : Serializable {
int, MinRes); // Must restart
};
//This class is the input parameter class for some testing programs
struct LocalCoherenceLanczosParams : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, saveEvecs,
bool, doFine,
@ -59,7 +59,7 @@ struct LocalCoherenceLanczosParams : Serializable {
RealD , coarse_relax_tol,
std::vector<int>, blockSize,
std::string, config,
std::vector < std::complex<double> >, omega,
std::vector < ComplexD >, omega,
RealD, mass,
RealD, M5);
};
@ -68,6 +68,7 @@ struct LocalCoherenceLanczosParams : Serializable {
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -83,14 +84,14 @@ public:
};
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard;
GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.Checkerboard()= checkerboard;
FineField fout(FineGrid); fout.Checkerboard() = checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
}
};
@ -98,6 +99,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -117,12 +119,12 @@ public:
{ };
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.checkerboard =checkerboard;
GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.Checkerboard() =checkerboard;
FineField fout(FineGrid);fout.Checkerboard() =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
@ -133,7 +135,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
{
public:
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -142,18 +144,26 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
//As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
//To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
//out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
//NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
RealD coarse_relax_tol=5.0e3,
int largestEvalIdxForReport=-1)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
_coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
{ };
//evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
CoarseField v(B);
@ -176,16 +186,30 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
RealD tmp_eval;
ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
}
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
//This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
//applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
//evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
//As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
//We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard;
evalMaxApprox = 1.0; //cf above
GridBase *FineGrid = _subspace[0].Grid();
int checkerboard = _subspace[0].Checkerboard();
FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
FineField fv(FineGrid);fv.Checkerboard() =checkerboard;
blockPromote(B,fv,_subspace);
@ -200,13 +224,13 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
eval = vnum/vden;
fv -= eval*fB;
RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
<<std::endl;
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
if( (vv<eresid*eresid) ) return 1;
return 0;
}
@ -284,6 +308,10 @@ public:
evals_coarse.resize(0);
};
//The block inner product is the inner product on the fine grid locally summed over the blocks
//to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
//vectors under the block inner product. This step must be performed after computing the fine grid
//eigenvectors and before computing the coarse grid eigenvectors.
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@ -305,11 +333,11 @@ public:
int Nk = nbasis;
subspace.resize(Nk,_FineGrid);
subspace[0]=1.0;
subspace[0].checkerboard=_checkerboard;
subspace[0].Checkerboard()=_checkerboard;
normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){
subspace[k].checkerboard=_checkerboard;
subspace[k].Checkerboard()=_checkerboard;
Op(subspace[k-1],subspace[k]);
normalise(subspace[k]);
}
@ -327,6 +355,8 @@ public:
}
}
//While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
//hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
@ -360,7 +390,11 @@ public:
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
FineField src(_FineGrid);
typedef typename FineField::scalar_type Scalar;
// src=1.0;
src=Scalar(1.0);
src.Checkerboard() = _checkerboard;
int Nconv;
IRL.calc(evals_fine,subspace,src,Nconv,false);
@ -371,25 +405,31 @@ public:
evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid);
}
//cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
//cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
//relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
Chebyshev<FineField> Cheby(cheby_op); //Chebyshev of fine operator on fine grid
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
Chebyshev<FineField> ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);
CoarseField src(_CoarseGrid); src=1.0;
//Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@ -400,7 +440,15 @@ public:
std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl;
}
}
//Get the fine eigenvector 'i' by reconstruction
void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
blockPromote(evec_coarse[i],evec,subspace);
eval = evals_coarse[i];
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,157 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/MinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MINIMAL_RESIDUAL_H
#define GRID_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
RealD overRelaxParam;
Integer IterationsToComplete; // Number of iterations the MR took to finish.
// Filled in upon completion
MinimalResidual(RealD tol, Integer maxit, Real ovrelparam = 1.0, bool err_on_no_conv = true)
: Tolerance(tol), MaxIterations(maxit), overRelaxParam(ovrelparam), ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
ComplexD a, c;
RealD d;
Field Mr(src);
Field r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Linop.Op(psi, Mr);
r = src - Mr;
RealD cp = norm2(r);
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "MinimalResidual: src " << ssq << std::endl;
std::cout << GridLogIterative << "MinimalResidual: cp,r " << cp << std::endl;
if (cp <= rsq) {
return;
}
std::cout << GridLogIterative << "MinimalResidual: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
MatrixTimer.Start();
Linop.Op(r, Mr);
MatrixTimer.Stop();
LinalgTimer.Start();
c = innerProduct(Mr, r);
d = norm2(Mr);
a = c / d;
a = a * overRelaxParam;
psi = psi + r * a;
r = r - Mr * a;
cp = norm2(r);
LinalgTimer.Stop();
std::cout << GridLogIterative << "MinimalResidual: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = " << a << " c = " << c << " d = " << d << std::endl;
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
Linop.Op(psi, Mr);
r = src - Mr;
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "MinimalResidual Converged on iteration " << k
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
if (ErrorOnNoConverge)
assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "MinimalResidual did NOT converge"
<< std::endl;
if (ErrorOnNoConverge)
assert(0);
IterationsToComplete = k;
}
};
} // namespace Grid
#endif

View File

@ -0,0 +1,276 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
public:
using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the MPFGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
GridStopWatch ChangePrecTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
GridBase* SinglePrecGrid;
LinearFunction<FieldF> &Preconditioner;
MixedPrecisionFlexibleGeneralisedMinimalResidual(RealD tol,
Integer maxit,
GridBase * sp_grid,
LinearFunction<FieldF> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, SinglePrecGrid(sp_grid)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
FieldD r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
std::cout << GridLogIterative << "MPFGMRES: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
ChangePrecTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "MPFGMRES: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: PrecChange " << ChangePrecTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
RealD cp = 0;
FieldD w(src.Grid());
FieldD r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<FieldD> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<FieldD> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<FieldD> &LinOp, std::vector<FieldD> &v, std::vector<FieldD> &z, FieldD &w, int iter) {
FieldF v_f(SinglePrecGrid);
FieldF z_f(SinglePrecGrid);
ChangePrecTimer.Start();
precisionChange(v_f, v[iter]);
precisionChange(z_f, z[iter]);
ChangePrecTimer.Stop();
PrecTimer.Start();
Preconditioner(v_f, z_f);
PrecTimer.Stop();
ChangePrecTimer.Start();
precisionChange(z[iter], z_f);
ChangePrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<FieldD> const &z, FieldD &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,38 +23,116 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_NORMAL_EQUATIONS_H
#define GRID_NORMAL_EQUATIONS_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public OperatorFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver) {};
void operator() (const Field &in, Field &out){
void operator() (const Field &in, Field &out){
Field src(in._grid);
Field src(in.Grid());
Field tmp(in.Grid());
_Matrix.Mdag(in,src);
_HermitianSolver(src,out); // Mdag M out = Mdag in
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Matrix.Mdag(in,src);
_Guess(src,out);
_HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in
}
};
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
}
};
Field res(in.Grid());
Field tmp(in.Grid());
}
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
_Guess(in,res);
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
_Matrix.Mdag(res,out); // out = Mdag res
}
};
template<class Field> class HPDSolver : public LinearFunction<Field> {
private:
LinearOperatorBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
HPDSolver(LinearOperatorBase<Field> &Matrix,
OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
_Guess(in,out);
_HermitianSolver(_Matrix,in,out); //M out = in
}
};
template<class Field> class MdagMSolver : public LinearFunction<Field> {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Guess(in,out);
_HermitianSolver(MdagMOp,in,out); // Mdag M out = Mdag in
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,46 @@
#pragma once
namespace Grid {
template<class Field> class PowerMethod
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_EST_ = 200;
for (int i=0;i<_MAX_ITER_EST_;i++) {
normalise(src_n);
HermOp.HermOp(src_n,tmp);
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
// evalMaxApprox = na;
// return evalMaxApprox;
// }
evalMaxApprox = na;
src_n = tmp;
}
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
};
}

View File

@ -0,0 +1,76 @@
#pragma once
namespace Grid {
class Band
{
RealD lo, hi;
public:
Band(RealD _lo,RealD _hi)
{
lo=_lo;
hi=_hi;
}
RealD operator() (RealD x){
if ( x>lo && x<hi ){
return 1.0;
} else {
return 0.0;
}
}
};
class PowerSpectrum
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
std::vector<RealD> ranges;
std::vector<int> order;
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
template<class Field>
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
int N=ranges.size();
RealD hi = ranges[N-1];
RealD lo_band = 0.0;
RealD hi_band;
RealD nn=norm2(src);
RealD ss=0.0;
Field tmp = src;
for(int b=0;b<N;b++){
hi_band = ranges[b];
Band Notch(lo_band,hi_band);
Chebyshev<Field> polynomial;
polynomial.Init(0.0,hi,order[b],Notch);
polynomial.JacksonSmooth();
polynomial(HermOp,src,tmp) ;
RealD p=norm2(tmp);
ss=ss+p;
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
lo_band=hi_band;
}
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
return 0;
};
};
}

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,97 +23,97 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
#define GRID_PREC_CONJUGATE_RESIDUAL_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class PrecConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
LinearFunction<Field> &Preconditioner;
template<class Field>
class PrecConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
LinearFunction<Field> &Preconditioner;
PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit), Preconditioner(Prec)
{
verbose=1;
};
PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit), Preconditioner(Prec)
{
verbose=1;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b, c, d;
RealD cp, ssq,rsq;
RealD a, b, c, d;
RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
GridBase *grid = src._grid;
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
GridBase *grid = src.Grid();
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=zero;
r = src;
Preconditioner(r,p);
psi=zero;
r = src;
Preconditioner(r,p);
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Ar=Ap;
rAr=pAp;
rAAr=pAAp;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Ar=Ap;
rAr=pAp;
rAAr=pAAp;
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=0;k<MaxIterations;k++){
for(int k=0;k<MaxIterations;k++){
Preconditioner(Ap,z);
RealD rq= real(innerProduct(Ap,z));
Preconditioner(Ap,z);
RealD rq= real(innerProduct(Ap,z));
a = rAr/rq;
a = rAr/rq;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,z,r);
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,z,r);
rArp=rAr;
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
b =rAr/rArp;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
}
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_H
#define GRID_PREC_GCR_H
@ -36,195 +36,204 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
LinearFunction<Field> &Preconditioner;
template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
verbose=1;
};
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
void Level(int lv) { level=lv; };
psi=zero;
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
void operator() (const Field &src, Field &psi){
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src._grid);
Field r(src.Grid());
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(Linop,src,psi,rsq);
cp=GCRnStep(src,psi,rsq);
std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
if(cp<rsq) {
SolverTimer.Stop();
SolverTimer.Stop();
Linop.HermOp(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
return;
}
Linop.HermOp(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
/*
GCRLogLevel<<"PGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
*/
return;
}
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0);
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
RealD a, b, c, d;
RealD zAz, zAAz;
RealD rAq, rq;
RealD cp;
RealD a, b;
RealD zAz, zAAz;
RealD rq;
GridBase *grid = src._grid;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
r=src-Az;
/////////////////////
// p = Prec(r)
/////////////////////
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
PrecTimer.Start();
Preconditioner(r,z);
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOp(z,tmp);
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
ttmp=tmp;
tmp=tmp-r;
LinalgTimer.Start();
/*
std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl;
std::cout<<GridLogMessage<<ttmp<<std::endl;
std::cout<<GridLogMessage<<tmp<<std::endl;
*/
q[peri_kp]=Az;
p[peri_kp]=z;
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
if((k==nstep-1)||(cp<rsq)){
return cp;
}
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop();
tmp=tmp-r;
std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
assert(0); // never reached
return cp;
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
};
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,242 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_NON_HERM_H
#define GRID_PREC_GCR_NON_HERM_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
void operator() (const Field &src, Field &psi){
// psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src.Grid());
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(src,psi,rsq);
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.Op(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
return;
}
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b;
// ComplexD zAz;
RealD zAAz;
ComplexD rq;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
LinalgTimer.Start();
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= innerProduct(q[peri_k],r); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,371 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithmsf/iterative/QuasiMinimalResidual.h
Copyright (C) 2019
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Field>
RealD innerG5ProductReal(Field &l, Field &r)
{
Gamma G5(Gamma::Algebra::Gamma5);
Field tmp(l.Grid());
// tmp = G5*r;
G5R5(tmp,r);
ComplexD ip =innerProduct(l,tmp);
std::cout << "innerProductRealG5R5 "<<ip<<std::endl;
return ip.real();
}
template<class Field>
class QuasiMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge;
RealD Tolerance;
Integer MaxIterations;
Integer IterationCount;
QuasiMinimalResidual(RealD tol,
Integer maxit,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, ErrorOnNoConverge(err_on_no_conv)
{};
#if 1
void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)
{
RealD resid;
IterationCount=0;
RealD rho, rho_1, xi, gamma, gamma_1, theta, theta_1;
RealD eta, delta, ep, beta;
GridBase *Grid = b.Grid();
Field r(Grid), d(Grid), s(Grid);
Field v(Grid), w(Grid), y(Grid), z(Grid);
Field v_tld(Grid), w_tld(Grid), y_tld(Grid), z_tld(Grid);
Field p(Grid), q(Grid), p_tld(Grid);
Real normb = norm2(b);
LinOp.Op(x,r); r = b - r;
assert(normb> 0.0);
resid = norm2(r)/normb;
if (resid <= Tolerance) {
return;
}
v_tld = r;
y = v_tld;
rho = norm2(y);
// Take Gamma5 conjugate
// Gamma G5(Gamma::Algebra::Gamma5);
// G5R5(w_tld,r);
// w_tld = G5* v_tld;
w_tld=v_tld;
z = w_tld;
xi = norm2(z);
gamma = 1.0;
eta = -1.0;
theta = 0.0;
for (int i = 1; i <= MaxIterations; i++) {
// Breakdown tests
assert( rho != 0.0);
assert( xi != 0.0);
v = (1. / rho) * v_tld;
y = (1. / rho) * y;
w = (1. / xi) * w_tld;
z = (1. / xi) * z;
ComplexD Zdelta = innerProduct(z, y); // Complex?
std::cout << "Zdelta "<<Zdelta<<std::endl;
delta = Zdelta.real();
y_tld = y;
z_tld = z;
if (i > 1) {
p = y_tld - (xi * delta / ep) * p;
q = z_tld - (rho * delta / ep) * q;
} else {
p = y_tld;
q = z_tld;
}
LinOp.Op(p,p_tld); // p_tld = A * p;
ComplexD Zep = innerProduct(q, p_tld);
ep=Zep.real();
std::cout << "Zep "<<Zep <<std::endl;
// Complex Audit
assert(abs(ep)>0);
beta = ep / delta;
assert(abs(beta)>0);
v_tld = p_tld - beta * v;
y = v_tld;
rho_1 = rho;
rho = norm2(y);
LinOp.AdjOp(q,w_tld);
w_tld = w_tld - beta * w;
z = w_tld;
xi = norm2(z);
gamma_1 = gamma;
theta_1 = theta;
theta = rho / (gamma_1 * beta);
gamma = 1.0 / sqrt(1.0 + theta * theta);
std::cout << "theta "<<theta<<std::endl;
std::cout << "gamma "<<gamma<<std::endl;
assert(abs(gamma)> 0.0);
eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
if (i > 1) {
d = eta * p + (theta_1 * theta_1 * gamma * gamma) * d;
s = eta * p_tld + (theta_1 * theta_1 * gamma * gamma) * s;
} else {
d = eta * p;
s = eta * p_tld;
}
x =x+d; // update approximation vector
r =r-s; // compute residual
if ((resid = norm2(r) / normb) <= Tolerance) {
return;
}
std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
}
assert(0);
return; // no convergence
}
#else
// QMRg5 SMP thesis
void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)
{
// Real scalars
GridBase *grid = b.Grid();
Field r(grid);
Field p_m(grid), p_m_minus_1(grid), p_m_minus_2(grid);
Field v_m(grid), v_m_minus_1(grid), v_m_plus_1(grid);
Field tmp(grid);
RealD w;
RealD z1, z2;
RealD delta_m, delta_m_minus_1;
RealD c_m_plus_1, c_m, c_m_minus_1;
RealD s_m_plus_1, s_m, s_m_minus_1;
RealD alpha, beta, gamma, epsilon;
RealD mu, nu, rho, theta, xi, chi;
RealD mod2r, mod2b;
RealD tau2, target2;
mod2b=norm2(b);
/////////////////////////
// Initial residual
/////////////////////////
LinOp.Op(x,tmp);
r = b - tmp;
/////////////////////////
// \mu = \rho = |r_0|
/////////////////////////
mod2r = norm2(r);
rho = sqrt( mod2r);
mu=rho;
std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
/////////////////////////
// Zero negative history
/////////////////////////
v_m_plus_1 = Zero();
v_m_minus_1 = Zero();
p_m_minus_1 = Zero();
p_m_minus_2 = Zero();
// v0
v_m = (1.0/rho)*r;
/////////////////////////
// Initial coeffs
/////////////////////////
delta_m_minus_1 = 1.0;
c_m_minus_1 = 1.0;
c_m = 1.0;
s_m_minus_1 = 0.0;
s_m = 0.0;
/////////////////////////
// Set up convergence check
/////////////////////////
tau2 = mod2r;
target2 = mod2b * Tolerance*Tolerance;
for(int iter = 0 ; iter < MaxIterations; iter++){
/////////////////////////
// \delta_m = (v_m, \gamma_5 v_m)
/////////////////////////
delta_m = innerG5ProductReal(v_m,v_m);
std::cout << "QuasiMinimalResidual delta_m "<< delta_m<<std::endl;
/////////////////////////
// tmp = A v_m
/////////////////////////
LinOp.Op(v_m,tmp);
/////////////////////////
// \alpha = (v_m, \gamma_5 temp) / \delta_m
/////////////////////////
alpha = innerG5ProductReal(v_m,tmp);
alpha = alpha/delta_m ;
std::cout << "QuasiMinimalResidual alpha "<< alpha<<std::endl;
/////////////////////////
// \beta = \rho \delta_m / \delta_{m-1}
/////////////////////////
beta = rho * delta_m / delta_m_minus_1;
std::cout << "QuasiMinimalResidual beta "<< beta<<std::endl;
/////////////////////////
// \tilde{v}_{m+1} = temp - \alpha v_m - \beta v_{m-1}
/////////////////////////
v_m_plus_1 = tmp - alpha*v_m - beta*v_m_minus_1;
///////////////////////////////
// \rho = || \tilde{v}_{m+1} ||
///////////////////////////////
rho = sqrt( norm2(v_m_plus_1) );
std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
///////////////////////////////
// v_{m+1} = \tilde{v}_{m+1}
///////////////////////////////
v_m_plus_1 = (1.0 / rho) * v_m_plus_1;
////////////////////////////////
// QMR recurrence coefficients.
////////////////////////////////
theta = s_m_minus_1 * beta;
gamma = c_m_minus_1 * beta;
epsilon = c_m * gamma + s_m * alpha;
xi = -s_m * gamma + c_m * alpha;
nu = sqrt( xi*xi + rho*rho );
c_m_plus_1 = fabs(xi) / nu;
if ( xi == 0.0 ) {
s_m_plus_1 = 1.0;
} else {
s_m_plus_1 = c_m_plus_1 * rho / xi;
}
chi = c_m_plus_1 * xi + s_m_plus_1 * rho;
std::cout << "QuasiMinimalResidual coeffs "<< theta <<" "<<gamma<<" "<< epsilon<<" "<< xi<<" "<< nu<<std::endl;
std::cout << "QuasiMinimalResidual coeffs "<< chi <<std::endl;
////////////////////////////////
//p_m=(v_m - \epsilon p_{m-1} - \theta p_{m-2}) / \chi
////////////////////////////////
p_m = (1.0/chi) * v_m - (epsilon/chi) * p_m_minus_1 - (theta/chi) * p_m_minus_2;
////////////////////////////////////////////////////////////////
// \psi = \psi + c_{m+1} \mu p_m
////////////////////////////////////////////////////////////////
x = x + ( c_m_plus_1 * mu ) * p_m;
////////////////////////////////////////
//
////////////////////////////////////////
mu = -s_m_plus_1 * mu;
delta_m_minus_1 = delta_m;
c_m_minus_1 = c_m;
c_m = c_m_plus_1;
s_m_minus_1 = s_m;
s_m = s_m_plus_1;
////////////////////////////////////
// Could use pointer swizzle games.
////////////////////////////////////
v_m_minus_1 = v_m;
v_m = v_m_plus_1;
p_m_minus_2 = p_m_minus_1;
p_m_minus_1 = p_m;
/////////////////////////////////////
// Convergence checks
/////////////////////////////////////
z1 = RealD(iter+1.0);
z2 = z1 + 1.0;
tau2 = tau2 *( z2 / z1 ) * s_m * s_m;
std::cout << " QuasiMinimumResidual iteration "<< iter<<std::endl;
std::cout << " QuasiMinimumResidual tau bound "<< tau2<<std::endl;
// Compute true residual
mod2r = tau2;
if ( 1 || (tau2 < (100.0 * target2)) ) {
LinOp.Op(x,tmp);
r = b - tmp;
mod2r = norm2(r);
std::cout << " QuasiMinimumResidual true residual is "<< mod2r<<std::endl;
}
if ( mod2r < target2 ) {
std::cout << " QuasiMinimumResidual has converged"<<std::endl;
return;
}
}
}
#endif
};
NAMESPACE_END(Grid);

View File

@ -99,10 +99,13 @@ namespace Grid {
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
public:
SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false) :
_HermitianRBSolver(HermitianRBSolver),
useSolnAsInitGuess(_solnAsInitGuess)
{
CBfactorise = 0;
subtractGuess(initSubGuess);
@ -129,6 +132,31 @@ namespace Grid {
(*this)(_Matrix,in,out,guess);
}
void RedBlackSource(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
}
// James can write his own deflated guesser
// with optimised code for the inner products
// RedBlackSolveSplitGrid();
// RedBlackSolve(_Matrix,src_o,sol_o);
void RedBlackSolution(Matrix &_Matrix, const std::vector<Field> &in, const std::vector<Field> &sol_o, std::vector<Field> &out)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++) {
pickCheckerboard(Even,tmp,in[b]);
RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
}
}
template<class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
{
@ -147,20 +175,29 @@ namespace Grid {
////////////////////////////////////////////////
// Prepare RedBlack source
////////////////////////////////////////////////
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
RedBlackSource(_Matrix,in,src_o);
// for(int b=0;b<nblock;b++){
// RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
// }
////////////////////////////////////////////////
// Make the guesses
////////////////////////////////////////////////
if ( subGuess ) guess_save.resize(nblock,grid);
for(int b=0;b<nblock;b++){
guess(src_o[b],sol_o[b]);
if(useSolnAsInitGuess) {
for(int b=0;b<nblock;b++){
pickCheckerboard(Odd, sol_o[b], out[b]);
}
} else {
guess(src_o, sol_o);
}
if ( subGuess ) {
guess_save[b] = sol_o[b];
}
if ( subGuess ) {
for(int b=0;b<nblock;b++){
guess_save[b] = sol_o[b];
}
}
//////////////////////////////////////////////////////////////
// Call the block solver
@ -216,8 +253,11 @@ namespace Grid {
////////////////////////////////
// Construct the guess
////////////////////////////////
Field tmp(grid);
guess(src_o,sol_o);
if(useSolnAsInitGuess) {
pickCheckerboard(Odd, sol_o, out);
} else {
guess(src_o,sol_o);
}
Field guess_save(grid);
guess_save = sol_o;
@ -251,7 +291,7 @@ namespace Grid {
}
/////////////////////////////////////////////////////////////
// Override in derived. Not virtual as template methods
// Override in derived.
/////////////////////////////////////////////////////////////
virtual void RedBlackSource (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) =0;
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) =0;
@ -264,8 +304,9 @@ namespace Grid {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess)
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess)
{
}
@ -286,9 +327,9 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
}
@ -306,17 +347,17 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd );
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
@ -333,8 +374,9 @@ namespace Grid {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess) {};
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
//////////////////////////////////////////////////////
@ -354,13 +396,13 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@ -374,17 +416,17 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e_i = src_e-tmp; assert( src_e_i.checkerboard ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.checkerboard ==Even);
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd );
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
@ -393,6 +435,151 @@ namespace Grid {
}
};
template<class Field> class NonHermitianSchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field>
{
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
NonHermitianSchurRedBlackDiagMooeeSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd , src_o, src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
}
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e_i(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even );
src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even );
_Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd );
}
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd);
}
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
{
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, left preconditioned by Mee^inv
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
//
// Solve:
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
//
// Old notation e<->o
//
// Left precon by Moo^-1
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
@ -405,8 +592,9 @@ namespace Grid {
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess) {};
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
@ -424,12 +612,12 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@ -450,12 +638,12 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.checkerboard ==Even);
tmp = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.checkerboard ==Even);
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.checkerboard ==Odd );
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@ -469,5 +657,76 @@ namespace Grid {
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
template<class Field> class NonHermitianSchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field>
{
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
NonHermitianSchurRedBlackDiagTwoSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd , src_o, src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
}
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field sol_o_i(grid);
Field tmp(grid);
Field sol_e(grid);
////////////////////////////////////////////////
// MooeeInv due to pecond
////////////////////////////////////////////////
_Matrix.MooeeInv(sol_o, tmp);
sol_o_i = tmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even );
tmp = src_e - tmp; assert( src_e.Checkerboard() == Even );
_Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd );
};
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{
NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
};
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
{
NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
}
};
}
#endif

View File

@ -0,0 +1,608 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/Aggregates.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x)
{
// return std::pow(x,-4);
// return std::pow(x,-3);
return std::pow(x,-5);
}
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
constexpr int Nbasis(void) { return nbasis; };
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspaceRandom(GridParallelRNG &RNG) {
int nn=nbasis;
RealD scale;
FineField noise(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
subspace[b] = noise;
}
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<3;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
subspace[b] = guess;
#endif
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
subspace[b] = noise;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
<<ordermin<<" step "<<orderstep
<<" lo"<<filterlo<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
ComplexD ip;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possible more fine grained control is needed than a linear sweep,
// but huge productivity gain if this is simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
ComplexD ip;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn);
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
// Generate a full sequence of Chebyshevs
for(int n=1;n<nn;n++){
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
Cheb(hermop,subspace[n-1],Mn);
for(int m=0;m<n;m++){
ComplexD c = innerProduct(subspace[m],Mn);
Mn = Mn - c*subspace[m];
}
// normalise
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[n]=Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
// Refine
Chebyshev<FineField> PowerLaw(lo,hi,1000,AggregatePowerLaw);
noise = Mn;
PowerLaw(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
// normalise
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void CreateSubspaceChebyshevPowerLaw(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
int orderfilter
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" [0,"<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
Chebyshev<FineField> Cheb(0.0,hi,orderfilter,AggregatePowerLaw);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void CreateSubspaceChebyshevNew(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
double hi
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
//#opt2(x) = acheb(x,3,90,300)* acheb(x,1,90,50) * acheb(x,0.5,90,200) * acheb(x,0.05,90,400) * acheb(x,0.01,90,1500)
/*266
Chebyshev<FineField> Cheb1(3.0,hi,300);
Chebyshev<FineField> Cheb2(1.0,hi,50);
Chebyshev<FineField> Cheb3(0.5,hi,300);
Chebyshev<FineField> Cheb4(0.05,hi,500);
Chebyshev<FineField> Cheb5(0.01,hi,2000);
*/
/* 242 */
/*
Chebyshev<FineField> Cheb3(0.1,hi,300);
Chebyshev<FineField> Cheb2(0.02,hi,1000);
Chebyshev<FineField> Cheb1(0.003,hi,2000);
8?
*/
/* How many??
*/
Chebyshev<FineField> Cheb2(0.001,hi,2500); // 169 iters on HDCG after refine
Chebyshev<FineField> Cheb1(0.02,hi,600);
// Chebyshev<FineField> Cheb2(0.001,hi,1500);
// Chebyshev<FineField> Cheb1(0.02,hi,600);
Cheb1(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb1 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
Cheb2(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb2 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb3(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb3 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb4(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb4 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb5(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb5 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
subspace[b] = noise;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<< " norm " << norm2(noise)<<std::endl;
}
}
virtual void CreateSubspaceMultishift(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
double Lo,double tol,int maxit)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Multishift subspace : Lo "<<Lo<<std::endl;
// Filter
// [ 1/6(x+Lo) - 1/2(x+2Lo) + 1/2(x+3Lo) -1/6(x+4Lo) = Lo^3 /[ (x+1Lo)(x+2Lo)(x+3Lo)(x+4Lo) ]
//
// 1/(x+Lo) - 1/(x+2 Lo)
double epsilon = Lo/3;
std::vector<RealD> alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0});
std::vector<RealD> shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon});
std::vector<RealD> tols({tol,tol,tol,tol});
std::cout << "sizes "<<alpha.size()<<" "<<shifts.size()<<" "<<tols.size()<<std::endl;
MultiShiftFunction msf(4,0.0,95.0);
std::cout << "msf constructed "<<std::endl;
msf.poles=shifts;
msf.residues=alpha;
msf.tolerances=tols;
msf.norm=0.0;
msf.order=alpha.size();
ConjugateGradientMultiShift<FineField> MSCG(maxit,msf);
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
MSCG(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void RefineSubspace(LinearOperatorBase<FineField> &hermop,
double Lo,double tol,int maxit)
{
FineField tmp(FineGrid);
for(int b =0;b<nbasis;b++)
{
ConjugateGradient<FineField> CGsloppy(tol,maxit,false);
ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,Lo);
tmp=Zero();
CGsloppy(hermop,subspace[b],tmp);
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b]=tmp;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void RefineSubspaceHDCG(LinearOperatorBase<FineField> &hermop,
TwoLevelADEF2mrhs<FineField,CoarseVector> & theHDCG,
int nrhs)
{
std::vector<FineField> src_mrhs(nrhs,FineGrid);
std::vector<FineField> res_mrhs(nrhs,FineGrid);
FineField tmp(FineGrid);
for(int b =0;b<nbasis;b+=nrhs)
{
tmp = subspace[b];
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b] =tmp;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "before filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
for(int r=0;r<MIN(nbasis-b,nrhs);r++){
src_mrhs[r] = subspace[b+r];
}
for(int r=0;r<nrhs;r++){
res_mrhs[r] = Zero();
}
theHDCG(src_mrhs,res_mrhs);
for(int r=0;r<MIN(nbasis-b,nrhs);r++){
tmp = res_mrhs[r];
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b+r]=tmp;
}
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "after filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,837 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/CoarsenedMatrix.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
NAMESPACE_BEGIN(Grid);
template<class vobj,class CComplex>
inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
const Lattice<decltype(innerProduct(vobj(),vobj()))> &FineMask,
const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY)
{
typedef decltype(innerProduct(vobj(),vobj())) dotp;
GridBase *coarse(CoarseInner.Grid());
GridBase *fine (fineX.Grid());
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> fine_inner_msk(fine);
// Multiply could be fused with innerProduct
// Single block sum kernel could do both masks.
fine_inner = localInnerProduct(fineX,fineY);
mult(fine_inner_msk, fine_inner,FineMask);
blockSum(CoarseInner,fine_inner_msk);
}
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<CComplex > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef CoarseVector FermionField;
// enrich interface, use default implementation as in FermionOperator ///////
void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
////////////////////
// Data members
////////////////////
Geometry geom;
GridBase * _grid;
GridBase* _cbgrid;
int hermitian;
CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd;
std::vector<CoarseMatrix> A;
std::vector<CoarseMatrix> Aeven;
std::vector<CoarseMatrix> Aodd;
CoarseMatrix AselfInv;
CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd;
deviceVector<RealD> dag_factor;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
GridBase * RedBlackGrid() { return _cbgrid; };
int ConstEE() { return 0; }
void M (const CoarseVector &in, CoarseVector &out)
{
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int osites=Grid()->oSites();
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int point=0;point<npoint;point++){
SE=Stencil_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
};
void Mdag (const CoarseVector &in, CoarseVector &out)
{
if(hermitian) {
// corresponds to Petrov-Galerkin coarsening
return M(in,out);
} else {
// corresponds to Galerkin coarsening
return MdagNonHermitian(in, out);
}
};
void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
{
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int osites=Grid()->oSites();
deviceVector<int> points(geom.npoint);
for(int p=0; p<geom.npoint; p++) {
acceleratorPut(points[p],geom.points_dagger[p]);
}
auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0];
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=Stencil_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
void MdirComms(const CoarseVector &in)
{
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
}
void MdirCalc(const CoarseVector &in, CoarseVector &out, int point)
{
conformable(_grid,in.Grid());
conformable(_grid,out.Grid());
out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
autoView( Stencil_v , Stencil, AcceleratorRead);
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
SE=Stencil_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{
this->MdirComms(in);
int ndir=geom.npoint-1;
if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
std::cout <<"MdirAll out size "<< out.size()<<std::endl;
std::cout <<"MdirAll ndir "<< ndir<<std::endl;
assert(0);
}
for(int p=0;p<ndir;p++){
MdirCalc(in,out[p],p);
}
};
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
this->MdirComms(in);
MdirCalc(in,out,geom.point(dir,disp));
};
void Mdiag(const CoarseVector &in, CoarseVector &out)
{
int point=geom.npoint-1;
MdirCalc(in, out, point); // No comms
};
void Mooee(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerNo, InverseNo);
}
void MooeeInv(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerNo, InverseYes);
}
void MooeeDag(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerYes, InverseNo);
}
void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerYes, InverseYes);
}
void Meooe(const CoarseVector &in, CoarseVector &out) {
if(in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
void MeooeDag(const CoarseVector &in, CoarseVector &out) {
if(in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, A, in, out, dag);
}
void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, Aodd, in, out, dag);
}
void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, Aeven, in, out, dag);
}
void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
out.Checkerboard() = in.Checkerboard();
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
CoarseMatrix *Aself = nullptr;
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
DselfInternal(StencilOdd, *Aself, in, out, dag);
} else {
Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
DselfInternal(StencilEven, *Aself, in, out, dag);
}
} else {
Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
DselfInternal(Stencil, *Aself, in, out, dag);
}
assert(Aself != nullptr);
}
void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
const CoarseVector &in, CoarseVector &out, int dag) {
int point = geom.npoint-1;
autoView( out_v, out, AcceleratorWrite);
autoView( in_v, in, AcceleratorRead);
autoView( st_v, st, AcceleratorRead);
autoView( a_v, a, AcceleratorRead);
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
RealD* dag_factor_p = &dag_factor[0];
if(dag) {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res);
});
} else {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res);
});
}
}
void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a,
const CoarseVector &in, CoarseVector &out, int dag) {
SimpleCompressor<siteVector> compressor;
st.HaloExchange(in,compressor);
autoView( in_v, in, AcceleratorRead);
autoView( out_v, out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
typedef LatticeView<Cobj> Aview;
// determine in what order we need the points
int npoint = geom.npoint-1;
deviceVector<int> points(npoint);
for(int p=0; p<npoint; p++) {
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
auto points_p = &points[0];
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
RealD* dag_factor_p = &dag_factor[0];
if(dag) {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
} else {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
}
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
_grid(&CoarseGrid),
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,_cbgrid),
Aodd(geom.npoint,_cbgrid),
AselfInv(&CoarseGrid),
AselfInvEven(_cbgrid),
AselfInvOdd(_cbgrid),
dag_factor(nbasis*nbasis)
{
fillFactor();
};
CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) :
_grid(&CoarseGrid),
_cbgrid(&CoarseRBGrid),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,&CoarseRBGrid),
Aodd(geom.npoint,&CoarseRBGrid),
AselfInv(&CoarseGrid),
AselfInvEven(&CoarseRBGrid),
AselfInvOdd(&CoarseRBGrid),
dag_factor(nbasis*nbasis)
{
fillFactor();
};
void fillFactor() {
Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis);
if(!hermitian) {
const int nb = nbasis/2;
dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
}
// GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, {
int j = i/nbasis;
int k = i%nbasis;
h_dag_factor[i] = dag_factor_eigen(j, k);
});
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
}
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
typedef typename Fobj::scalar_type scalar_type;
std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
std::vector<FineComplexField> masks(geom.npoint,FineGrid);
FineComplexField imask(FineGrid); // contributions from within this block
FineComplexField omask(FineGrid); // contributions from outwith this block
FineComplexField evenmask(FineGrid);
FineComplexField oddmask(FineGrid);
FineField phi(FineGrid);
FineField tmp(FineGrid);
FineField zz(FineGrid); zz=Zero();
FineField Mphi(FineGrid);
FineField Mphie(FineGrid);
FineField Mphio(FineGrid);
std::vector<FineField> Mphi_p(geom.npoint,FineGrid);
Lattice<iScalar<vInteger> > coor (FineGrid);
Lattice<iScalar<vInteger> > bcoor(FineGrid);
Lattice<iScalar<vInteger> > bcb (FineGrid); bcb = Zero();
CoarseVector iProj(Grid());
CoarseVector oProj(Grid());
CoarseVector SelfProj(Grid());
CoarseComplexField iZProj(Grid());
CoarseComplexField oZProj(Grid());
CoarseScalar InnerProd(Grid());
std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl;
// Orthogonalise the subblocks over the basis
blockOrthogonalise(InnerProd,Subspace.subspace);
// Compute the matrix elements of linop between this orthonormal
// set of vectors.
std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
int self_stencil=-1;
for(int p=0;p<geom.npoint;p++)
{
int dir = geom.directions[p];
int disp = geom.displacements[p];
A[p]=Zero();
if( geom.displacements[p]==0){
self_stencil=p;
}
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
LatticeCoordinate(coor,dir);
///////////////////////////////////////////////////////
// Work out even and odd block checkerboarding for fast diagonal term
///////////////////////////////////////////////////////
if ( disp==1 ) {
bcb = bcb + div(coor,block);
}
if ( disp==0 ) {
masks[p]= Zero();
} else if ( disp==1 ) {
masks[p] = where(mod(coor,block)==(block-1),one,zero);
} else if ( disp==-1 ) {
masks[p] = where(mod(coor,block)==(Integer)0,one,zero);
}
}
evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
oddmask = one-evenmask;
assert(self_stencil!=-1);
for(int i=0;i<nbasis;i++){
phi=Subspace.subspace[i];
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
linop.OpDirAll(phi,Mphi_p);
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
for(int p=0;p<geom.npoint;p++){
Mphi = Mphi_p[p];
int dir = geom.directions[p];
int disp = geom.displacements[p];
if ( (disp==-1) || (!hermitian ) ) {
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
////////////////////////////////////////////////////////////////////////
omask = masks[p];
imask = one-omask;
for(int j=0;j<nbasis;j++){
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
autoView( iZProj_v , iZProj, AcceleratorRead) ;
autoView( oZProj_v , oZProj, AcceleratorRead) ;
autoView( A_p , A[p], AcceleratorWrite);
autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
if ( hermitian && (disp==-1) ) {
for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
int dirp = geom.directions[pp];
int dispp = geom.displacements[pp];
if ( (dirp==dir) && (dispp==1) ){
auto sft = conjugate(Cshift(oZProj,dir,1));
autoView( sft_v , sft , AcceleratorWrite);
autoView( A_pp , A[pp], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
}
}
}
}
}
}
///////////////////////////////////////////
// Faster alternate self coupling.. use hermiticity to save 2x
///////////////////////////////////////////
{
mult(tmp,phi,evenmask); linop.Op(tmp,Mphie);
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
{
autoView( tmp_ , tmp, AcceleratorWrite);
autoView( evenmask_ , evenmask, AcceleratorRead);
autoView( oddmask_ , oddmask, AcceleratorRead);
autoView( Mphie_ , Mphie, AcceleratorRead);
autoView( Mphio_ , Mphio, AcceleratorRead);
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
});
}
blockProject(SelfProj,tmp,Subspace.subspace);
autoView( SelfProj_ , SelfProj, AcceleratorRead);
autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
for(int j=0;j<nbasis;j++){
coalescedWrite(A_self[ss](j,i), SelfProj_(ss)(j));
}
});
}
}
if(hermitian) {
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
}
InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
}
void InvertSelfStencilLink() {
std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
int localVolume = Grid()->lSites();
typedef typename Cobj::scalar_object scalar_object;
autoView(Aself_v, A[geom.npoint-1], CpuRead);
autoView(AselfInv_v, AselfInv, CpuWrite);
thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
scalar_object selfLink = Zero();
scalar_object selfLinkInv = Zero();
Coordinate lcoor;
Grid()->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(selfLink, Aself_v, lcoor);
for (int i = 0; i < nbasis; ++i)
for (int j = 0; j < nbasis; ++j)
selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
selfLinkInvEigen = selfLinkEigen.inverse();
for(int i = 0; i < nbasis; ++i)
for(int j = 0; j < nbasis; ++j)
selfLinkInv(i, j) = selfLinkInvEigen(i, j);
pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
});
}
void FillHalfCbs() {
std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
for(int p = 0; p < geom.npoint; ++p) {
pickCheckerboard(Even, Aeven[p], A[p]);
pickCheckerboard(Odd, Aodd[p], A[p]);
}
pickCheckerboard(Even, AselfInvEven, AselfInv);
pickCheckerboard(Odd, AselfInvOdd, AselfInv);
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,619 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef Lattice<CComplex > FineComplexField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
int hermitian;
GridBase * _FineGrid;
GridCartesian * _CoarseGrid;
NonLocalStencilGeometry &geom;
PaddedCell Cell;
GeneralLocalStencil Stencil;
std::vector<CoarseMatrix> _A;
std::vector<CoarseMatrix> _Adag;
std::vector<CoarseVector> MultTemporaries;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know
GridBase * FineGrid(void) { return _FineGrid; }; // this is all the linalg routines need to know
GridCartesian * CoarseGrid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know
/* void ShiftMatrix(RealD shift)
{
int Nd=_FineGrid->Nd();
Coordinate zero_shift(Nd,0);
for(int p=0;p<geom.npoint;p++){
if ( zero_shift==geom.shifts[p] ) {
_A[p] = _A[p]+shift;
// _Adag[p] = _Adag[p]+shift;
}
}
}
void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
{
int nfound=0;
std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
for(int p=0;p<geom.npoint;p++){
for(int pp=0;pp<CopyMe.geom.npoint;pp++){
// Search for the same relative shift
// Avoids brutal handling of Grid pointers
if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
_A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
// _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
nfound++;
}
}
}
assert(nfound==geom.npoint);
ExchangeCoarseLinks();
}
*/
GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
: geom(_geom),
_FineGrid(FineGrid),
_CoarseGrid(CoarseGrid),
hermitian(1),
Cell(_geom.Depth(),_CoarseGrid),
Stencil(Cell.grids.back(),geom.shifts)
{
{
int npoint = _geom.npoint;
}
_A.resize(geom.npoint,CoarseGrid);
// _Adag.resize(geom.npoint,CoarseGrid);
}
void M (const CoarseVector &in, CoarseVector &out)
{
Mult(_A,in,out);
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
assert(hermitian);
Mult(_A,in,out);
// if ( hermitian ) M(in,out);
// else Mult(_Adag,in,out);
}
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
RealD tmult2=0;
ttot=-usecond();
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
CoarseVector tin=in;
texch-=usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin);
texch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t osites=pin.Grid()->oSites();
RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+ 2.0*osites*sizeof(siteVector)*npoint;
{
tviews-=usecond();
autoView( in_v , pin, AcceleratorRead);
autoView( out_v , pout, AcceleratorWriteDiscard);
autoView( Stencil_v , Stencil, AcceleratorRead);
tviews+=usecond();
// Static and prereserve to keep UVM region live and not resized across multiple calls
ttemps-=usecond();
MultTemporaries.resize(npoint,pin.Grid());
ttemps+=usecond();
std::vector<Aview> AcceleratorViewContainer_h;
std::vector<Vview> AcceleratorVecViewContainer_h;
tviews-=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h.push_back( A[p].View(AcceleratorRead));
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
}
tviews+=usecond();
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
auto Aview_p = &AcceleratorViewContainer[0];
auto Vview_p = &AcceleratorVecViewContainer[0];
tcopy-=usecond();
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
tcopy+=usecond();
tmult-=usecond();
accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int32_t ss = spb/(nbasis*npoint);
int32_t bp = spb%(nbasis*npoint);
int32_t point= bp/nbasis;
int32_t b = bp%nbasis;
auto SE = Stencil_v.GetEntry(point,ss);
auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
for(int bb=1;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
}
coalescedWrite(Vview_p[point][ss](b),res);
});
tmult2-=usecond();
accelerator_for(sb, osites*nbasis, Nsimd, {
int ss = sb/nbasis;
int b = sb%nbasis;
auto res = coalescedRead(Vview_p[0][ss](b));
for(int point=1;point<npoint;point++){
res = res + coalescedRead(Vview_p[point][ss](b));
}
coalescedWrite(out_v[ss](b),res);
});
tmult2+=usecond();
tmult+=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h[p].ViewClose();
AcceleratorVecViewContainer_h[p].ViewClose();
}
}
text-=usecond();
out = Cell.Extract(pout);
text+=usecond();
ttot+=usecond();
std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogPerformance<<" of which mult2 "<<tmult2<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
// std::cout << GridLogPerformance<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
void PopulateAdag(void)
{
for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
Coordinate bcoor;
CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
for(int p=0;p<geom.npoint;p++){
Coordinate scoor = bcoor;
for(int mu=0;mu<bcoor.size();mu++){
int L = CoarseGrid()->GlobalDimensions()[mu];
scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
}
// Flip to poke/peekLocalSite and not too bad
auto link = peekSite(_A[p],scoor);
int pp = geom.Reverse(p);
pokeSite(adj(link),_Adag[pp],bcoor);
}
}
}
/////////////////////////////////////////////////////////////
//
// A) Only reduced flops option is to use a padded cell of depth 4
// and apply MpcDagMpc in the padded cell.
//
// Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
// With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
// Cost is 81x more, same as stencil size.
//
// But: can eliminate comms and do as local dirichlet.
//
// Local exchange gauge field once.
// Apply to all vectors, local only computation.
// Must exchange ghost subcells in reverse process of PaddedCell to take inner products
//
// B) Can reduce cost: pad by 1, apply Deo (4^4+6^4+8^4+8^4 )/ (4x 4^4)
// pad by 2, apply Doe
// pad by 3, apply Deo
// then break out 8x directions; cost is ~10x MpcDagMpc per vector
//
// => almost factor of 10 in setup cost, excluding data rearrangement
//
// Intermediates -- ignore the corner terms, leave approximate and force Hermitian
// Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
/////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////
// BFM HDCG style approach: Solve a system of equations to get Aij
//////////////////////////////////////////////////////////
/*
* Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
*
* conj(phases[block]) proj[k][ block*Nvec+j ] = \sum_ball e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} >
* = \sum_ball e^{iqk.delta} A_ji
*
* Must invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*/
#if 0
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
CoarseVector coarseInner(CoarseGrid());
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
tphase-=usecond();
CoarseComplexField coor(CoarseGrid());
CoarseComplexField pha(CoarseGrid()); pha=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha =exp(pha*ci);
phaV=Zero();
blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
tphase+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
}
// Need to write something to populate Adag from A
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid());
CoarseVector coarseInner(CoarseGrid());
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
tphase=-usecond();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid());
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
// std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl;
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
}
for(int p=0;p<geom.npoint;p++){
std::cout << " _A["<<p<<"] "<<norm2(_A[p])<<std::endl;
}
// Need to write something to populate Adag from A
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#endif
void ExchangeCoarseLinks(void){
for(int p=0;p<geom.npoint;p++){
_A[p] = Cell.ExchangePeriodic(_A[p]);
// _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
}
}
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,729 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef typename CComplex::scalar_object SComplex;
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef MultiGeneralCoarsenedMatrix<Fobj,CComplex,nbasis> MultiGeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef iVector<SComplex,nbasis > calcVector;
typedef iMatrix<SComplex,nbasis > calcMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef Lattice<CComplex > FineComplexField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
GridCartesian * _CoarseGridMulti;
NonLocalStencilGeometry geom;
NonLocalStencilGeometry geom_srhs;
PaddedCell Cell;
GeneralLocalStencil Stencil;
deviceVector<calcVector> BLAS_B;
deviceVector<calcVector> BLAS_C;
std::vector<deviceVector<calcMatrix> > BLAS_A;
std::vector<deviceVector<ComplexD *> > BLAS_AP;
std::vector<deviceVector<ComplexD *> > BLAS_BP;
deviceVector<ComplexD *> BLAS_CP;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
GridCartesian * CoarseGrid(void) { return _CoarseGridMulti; }; // this is all the linalg routines need to know
// Can be used to do I/O on the operator matrices externally
void SetMatrix (int p,CoarseMatrix & A)
{
assert(A.size()==geom_srhs.npoint);
GridtoBLAS(A[p],BLAS_A[p]);
}
void GetMatrix (int p,CoarseMatrix & A)
{
assert(A.size()==geom_srhs.npoint);
BLAStoGrid(A[p],BLAS_A[p]);
}
void CopyMatrix (GeneralCoarseOp &_Op)
{
for(int p=0;p<geom.npoint;p++){
auto Aup = _Op.Cell.Extract(_Op._A[p]);
//Unpadded
GridtoBLAS(Aup,BLAS_A[p]);
}
}
/*
void CheckMatrix (GeneralCoarseOp &_Op)
{
std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
for(int p=0;p<geom.npoint;p++){
//Unpadded
auto Aup = _Op.Cell.Extract(_Op._A[p]);
auto Ack = Aup;
BLAStoGrid(Ack,BLAS_A[p]);
std::cout << p<<" Ack "<<norm2(Ack)<<std::endl;
std::cout << p<<" Aup "<<norm2(Aup)<<std::endl;
}
std::cout <<"************* "<<std::endl;
}
*/
MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridCartesian *CoarseGridMulti) :
_CoarseGridMulti(CoarseGridMulti),
geom_srhs(_geom),
geom(_CoarseGridMulti,_geom.hops,_geom.skip+1),
Cell(geom.Depth(),_CoarseGridMulti),
Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
{
int32_t padded_sites = Cell.grids.back()->lSites();
int32_t unpadded_sites = CoarseGridMulti->lSites();
int32_t nrhs = CoarseGridMulti->FullDimensions()[0]; // # RHS
int32_t orhs = nrhs/CComplex::Nsimd();
padded_sites = padded_sites/nrhs;
unpadded_sites = unpadded_sites/nrhs;
/////////////////////////////////////////////////
// Device data vector storage
/////////////////////////////////////////////////
BLAS_A.resize(geom.npoint);
for(int p=0;p<geom.npoint;p++){
BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
}
BLAS_B.resize(nrhs *padded_sites); // includes ghost zone
BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
BLAS_AP.resize(geom.npoint);
BLAS_BP.resize(geom.npoint);
for(int p=0;p<geom.npoint;p++){
BLAS_AP[p].resize(unpadded_sites);
BLAS_BP[p].resize(unpadded_sites);
}
BLAS_CP.resize(unpadded_sites);
/////////////////////////////////////////////////
// Pointers to data
/////////////////////////////////////////////////
// Site identity mapping for A
for(int p=0;p<geom.npoint;p++){
for(int ss=0;ss<unpadded_sites;ss++){
ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
acceleratorPut(BLAS_AP[p][ss],ptr);
}
}
// Site identity mapping for C
for(int ss=0;ss<unpadded_sites;ss++){
ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
acceleratorPut(BLAS_CP[ss],ptr);
}
// Neighbour table is more complicated
int32_t j=0; // Interior point counter (unpadded)
for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
int ghost_zone=0;
for(int32_t point = 0 ; point < geom.npoint; point++){
int i=s*orhs*geom.npoint+point;
if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
}
}
if( ghost_zone==0) {
for(int32_t point = 0 ; point < geom.npoint; point++){
int i=s*orhs*geom.npoint+point;
int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
assert(nbr<BLAS_B.size());
ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
}
j++;
}
}
assert(j==unpadded_sites);
}
template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *Fg = from.Grid();
assert(!Fg->_isCheckerBoarded);
int nd = Fg->_ndimension;
to.resize(Fg->lSites());
Coordinate LocalLatt = Fg->LocalDimensions();
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
////////////////////////////////////////////////////////////////////////////////////////////////
// do the index calc on the GPU
////////////////////////////////////////////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
autoView(from_v,from,AcceleratorRead);
auto to_v = &to[0];
const int words=sizeof(vobj)/sizeof(vector_type);
accelerator_for(idx,nsite,1,{
Coordinate from_coor, base;
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
for(int i=0;i<nd;i++){
from_coor[i] = base[i];
}
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
scalar_type* to = (scalar_type *)&to_v[idx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
to[w] = stmp;
}
});
}
template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *Tg = grid.Grid();
assert(!Tg->_isCheckerBoarded);
int nd = Tg->_ndimension;
assert(in.size()==Tg->lSites());
Coordinate LocalLatt = Tg->LocalDimensions();
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
////////////////////////////////////////////////////////////////////////////////////////////////
// do the index calc on the GPU
////////////////////////////////////////////////////////////////////////////////////////////////
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
autoView(to_v,grid,AcceleratorWrite);
auto from_v = &in[0];
const int words=sizeof(vobj)/sizeof(vector_type);
accelerator_for(idx,nsite,1,{
Coordinate to_coor, base;
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
for(int i=0;i<nd;i++){
to_coor[i] = base[i];
}
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type* from = (scalar_type *)&from_v[idx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp=from[w];
putlane(to[w], stmp, to_lane);
}
});
}
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace,
GridBase *CoarseGrid)
{
#if 0
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
phaV = phaF[p]*Subspace.subspace[i];
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
linop.Op(phaV,MphaV);
// Fixme, could use batched block projector here
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
for(int k=0;k<npoint;k++){
FT = Zero();
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
/*
Grid : Message : 11698.730546 s : CoarsenOperator eigen 1334 us
Grid : Message : 11698.730563 s : CoarsenOperator phase 34729 us
Grid : Message : 11698.730565 s : CoarsenOperator phaseBZ 2423814 us
Grid : Message : 11698.730566 s : CoarsenOperator mat 127890998 us
Grid : Message : 11698.730567 s : CoarsenOperator proj 515840840 us
Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us
Takes 600s to compute matrix elements, DOMINATED by the block project.
Easy to speed up with the batched block project.
Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
// Block project below taks to 240s
Grid : Message : 328.193418 s : CoarsenOperator phase 38338 us
Grid : Message : 328.193434 s : CoarsenOperator phaseBZ 1711226 us
Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
//Grid : Message : 328.193438 s : CoarsenOperator proj 1181154 us <-- this is mistimed
//Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us <-- Cut this ~10x if lucky by loop fusion
*/
#else
RealD tproj=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
MultiRHSBlockProject<Lattice<Fobj> > Projector;
Projector.Allocate(nbasis,grid,CoarseGrid);
Projector.ImportBasis(Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
tphase=-usecond();
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
// Count use small chunks than npoint == 81 and save memory
int batch = 9;
std::vector<FineField> _MphaV(batch,grid);
std::vector<CoarseVector> TmpProj(batch,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
// std::cout << GridLogMessage << " phasing the fine vector "<<std::endl;
// Fixme : do this in batches
for(int p=0;p<npoint;p+=batch){ // Loop over momenta in npoint
for(int b=0;b<MIN(batch,npoint-p);b++){
tphaseBZ-=usecond();
phaV = phaF[p+b]*Subspace.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
// Memory footprint was an issue
tmat-=usecond();
linop.Op(phaV,MphaV);
_MphaV[b] = MphaV;
tmat+=usecond();
}
// std::cout << GridLogMessage << " Calling block project "<<std::endl;
tproj-=usecond();
Projector.blockProject(_MphaV,TmpProj);
tproj+=usecond();
// std::cout << GridLogMessage << " conj phasing the coarse vectors "<<std::endl;
for(int b=0;b<MIN(batch,npoint-p);b++){
ComputeProj[p+b] = conjugate(pha[p+b])*TmpProj[b];
}
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
// std::cout << GridLogMessage << " Starting FT inv "<<std::endl;
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT = Zero();
// 81 kernel calls as many ComputeProj vectors
// Could fuse with a vector of views, but ugly
// Could unroll the expression and run fewer kernels -- much more attractive
// Could also do non blocking.
#if 0
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#else
const int radix = 9;
int ll;
for(ll=0;ll+radix-1<npoint;ll+=radix){
// When ll = npoint-radix, ll+radix-1 = npoint-1, and we do it all.
FT = FT
+ invMkl(ll+0,k)*ComputeProj[ll+0]
+ invMkl(ll+1,k)*ComputeProj[ll+1]
+ invMkl(ll+2,k)*ComputeProj[ll+2]
+ invMkl(ll+3,k)*ComputeProj[ll+3]
+ invMkl(ll+4,k)*ComputeProj[ll+4]
+ invMkl(ll+5,k)*ComputeProj[ll+5]
+ invMkl(ll+6,k)*ComputeProj[ll+6]
+ invMkl(ll+7,k)*ComputeProj[ll+7]
+ invMkl(ll+8,k)*ComputeProj[ll+8];
}
for(int l=ll;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#endif
// 1 kernel call -- must be cheaper
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
// std::cout << GridLogMessage << " Calling GridtoBLAS "<<std::endl;
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
#endif
}
void Mdag(const CoarseVector &in, CoarseVector &out)
{
this->M(in,out);
}
void M (const CoarseVector &in, CoarseVector &out)
{
// std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
RealD t_tot;
RealD t_exch;
RealD t_GtoB;
RealD t_BtoG;
RealD t_mult;
t_tot=-usecond();
CoarseVector tin=in;
t_exch=-usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
t_exch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef calcMatrix* Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
assert(nrhs>=1);
RealD flops,bytes;
int64_t osites=in.Grid()->oSites(); // unpadded
int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs;
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
+ 2.0*osites*sizeof(siteVector)*npoint;
t_GtoB=-usecond();
GridtoBLAS(pin,BLAS_B);
t_GtoB+=usecond();
GridBLAS BLAS;
t_mult=-usecond();
for(int p=0;p<geom.npoint;p++){
RealD c = 1.0;
if (p==0) c = 0.0;
ComplexD beta(c);
BLAS.gemmBatched(nbasis,nrhs,nbasis,
ComplexD(1.0),
BLAS_AP[p],
BLAS_BP[p],
ComplexD(c),
BLAS_CP);
}
BLAS.synchronise();
t_mult+=usecond();
t_BtoG=-usecond();
BLAStoGrid(out,BLAS_C);
t_BtoG+=usecond();
t_tot+=usecond();
/*
std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult tot "<<t_tot<<" us"<<std::endl;
*/
// std::cout << GridLogMessage<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,238 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////
// Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
/////////////////////////////////////////////////////////////////
// Less local equivalent of Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class NonLocalStencilGeometry {
public:
// int depth;
int skip;
int hops;
int npoint;
std::vector<Coordinate> shifts;
Coordinate stencil_size;
Coordinate stencil_lo;
Coordinate stencil_hi;
GridCartesian *grid;
GridCartesian *Grid() {return grid;};
int Depth(void){return 1;}; // Ghost zone depth
int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
int DimSkip(void){return skip;};
virtual ~NonLocalStencilGeometry() {};
int Reverse(int point)
{
int Nd = Grid()->Nd();
Coordinate shft = shifts[point];
Coordinate rev(Nd);
for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
for(int p=0;p<npoint;p++){
if(rev==shifts[p]){
return p;
}
}
assert(0);
return -1;
}
void BuildShifts(void)
{
this->shifts.resize(0);
int Nd = this->grid->Nd();
int dd = this->DimSkip();
for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
Coordinate sft(Nd,0);
sft[dd+0] = s0;
sft[dd+1] = s1;
sft[dd+2] = s2;
sft[dd+3] = s3;
int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
if(nhops<=this->hops) this->shifts.push_back(sft);
}}}}
this->npoint = this->shifts.size();
std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
}
NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops,int _skip) : grid(_coarse_grid), hops(_hops), skip(_skip)
{
Coordinate latt = grid->GlobalDimensions();
stencil_size.resize(grid->Nd());
stencil_lo.resize(grid->Nd());
stencil_hi.resize(grid->Nd());
for(int d=0;d<grid->Nd();d++){
if ( latt[d] == 1 ) {
stencil_lo[d] = 0;
stencil_hi[d] = 0;
stencil_size[d]= 1;
} else if ( latt[d] == 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 0;
stencil_size[d]= 2;
} else if ( latt[d] > 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 1;
stencil_size[d]= 3;
}
}
this->BuildShifts();
};
};
// Need to worry about red-black now
class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
public:
virtual int DerivedDimSkip(void) { return 0;};
NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,0) { };
virtual ~NonLocalStencilGeometry4D() {};
};
class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
public:
virtual int DerivedDimSkip(void) { return 1; };
NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops,1) { };
virtual ~NonLocalStencilGeometry5D() {};
};
/*
* Bunch of different options classes
*/
class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,4)
{
};
};
class NextToNextToNextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,4)
{
};
};
class NextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,2)
{
};
};
class NextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,2)
{
};
};
class NearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,1)
{
};
};
class NearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,1)
{
};
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid/algorithms/multigrid/MultiGrid.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/multigrid/Aggregates.h>
#include <Grid/algorithms/multigrid/Geometry.h>
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>

View File

@ -24,109 +24,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
*************************************************************************************/
/* END LEGAL */
#pragma once
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
namespace Grid {
class PointerCache {
private:
static const int Ncache=8;
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl;\
}
#define profilerAllocate(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalAllocated += (bytes);\
s->currentlyAllocated += (bytes);\
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
#define profilerFree(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalFreed += (bytes);\
s->currentlyAllocated -= (bytes);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
template<typename _Tp>
class alignedAllocator {
@ -151,68 +53,34 @@ public:
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
//////////////////
// Hack 2MB align; could make option probably doesn't need configurability
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
// First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n) {
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
MemoryManager::CpuFree((void *)__p,bytes);
}
void construct(pointer __p, const _Tp& __val) { };
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif
//////////////////////////////////////////////////////////////////////////////////////
// Unified virtual memory
//////////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class commAllocator {
class uvmAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
@ -222,94 +90,133 @@ public:
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
uvmAllocator() throw() { }
uvmAllocator(const uvmAllocator&) throw() { }
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
~uvmAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
// BACKTRACEFILE();
exit(0);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( bcast == (void *) ptr);
#endif
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
MemoryManager::SharedFree((void *)__p,bytes);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Device memory
////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class devAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
devAllocator() throw() { }
devAllocator(const devAllocator&) throw() { }
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
~devAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
MemoryManager::AcceleratorFree((void *)__p,bytes);
}
#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,commAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
}; // namespace Grid
#endif
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
/*
template<class T> class vecView
{
protected:
T * data;
uint64_t size;
ViewMode mode;
void * cpu_ptr;
public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(Vector<T> &refer_to_me,ViewMode _mode)
{
cpu_ptr = &refer_to_me[0];
size = refer_to_me.size();
mode = _mode;
data =(T *) MemoryManager::ViewOpen(cpu_ptr,
size*sizeof(T),
mode,
AdviseDefault);
}
void ViewClose(void)
{ // Inform the manager
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
{
vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed
}
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid);

View File

@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/MemoryManager.h>
#include <Grid/allocator/AlignedAllocator.h>

View File

@ -0,0 +1,362 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuHuge (1)
#define CpuSmall (2)
#define Acc (3)
#define AccHuge (4)
#define AccSmall (5)
#define Shared (6)
#define SharedHuge (7)
#define SharedSmall (8)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
#if defined(__has_feature)
#if __has_feature(leak_sanitizer)
#define ASAN_LEAK_CHECK
#endif
#endif
#ifdef ASAN_LEAK_CHECK
#include <sanitizer/asan_interface.h>
#include <sanitizer/common_interface_defs.h>
#include <sanitizer/lsan_interface.h>
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
#else
#define LEAK_CHECK(A) { }
#endif
void MemoryManager::DisplayMallinfo(void)
{
#ifdef __linux__
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
mi = mallinfo();
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
#endif
LEAK_CHECK();
}
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : PrintBytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
uint64_t cacheBytes;
cacheBytes = CacheBytes[Cpu];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Acc];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Shared];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
#ifdef GRID_CUDA
cuda_mem();
#endif
DisplayMallinfo();
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
total_device+=bytes;
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
total_device-=bytes;
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorFree "<<std::endl;
PrintBytes();
#endif
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
total_shared+=bytes;
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
total_shared-=bytes;
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedFree "<<std::endl;
PrintBytes();
#endif
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#endif
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void MemoryManager::Init(void)
{
char * str;
int Nc;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
Ncache[Shared]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_HUGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuHuge]=Nc;
Ncache[AccHuge]=Nc;
Ncache[SharedHuge]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc;
}
}
}
void MemoryManager::InitMessage(void) {
#ifndef GRID_UVM
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
#endif
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
#endif
#ifdef GRID_UVM
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
if (ncache == 0) return ptr;
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
cacheBytes -= entries[v].bytes;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
cacheBytes += bytes;
return ret;
}
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
cacheBytes -= entries[e].bytes;
return entries[e].address;
}
}
return NULL;
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,227 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryManager.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <list>
#include <unordered_map>
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT (2147483648)
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define FILE_LINE __FILE__ ":" TOSTRING(__LINE__)
#define AUDIT(a) MemoryManager::Audit(FILE_LINE)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum ViewAdvise {
AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
// AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
// AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
struct MemoryStatus {
uint64_t DeviceBytes;
uint64_t DeviceLRUBytes;
uint64_t DeviceMaxBytes;
uint64_t HostToDeviceBytes;
uint64_t DeviceToHostBytes;
uint64_t HostToDeviceXfer;
uint64_t DeviceToHostXfer;
uint64_t DeviceEvictions;
uint64_t DeviceDestroy;
uint64_t DeviceAllocCacheBytes;
uint64_t HostAllocCacheBytes;
};
class MemoryManager {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=9;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
static uint64_t CacheBytes[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
public:
static void PrintBytes(void);
static void Audit(std::string s);
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void *SharedAllocate(size_t bytes);
static void SharedFree (void *ptr,size_t bytes);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
static uint64_t DeviceBytes;
static uint64_t DeviceLRUBytes;
static uint64_t DeviceMaxBytes;
static uint64_t HostToDeviceBytes;
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
static uint64_t DeviceEvictions;
static uint64_t DeviceDestroy;
static uint64_t DeviceCacheBytes();
static uint64_t HostCacheBytes();
static MemoryStatus GetFootprint(void) {
MemoryStatus stat;
stat.DeviceBytes = DeviceBytes;
stat.DeviceLRUBytes = DeviceLRUBytes;
stat.DeviceMaxBytes = DeviceMaxBytes;
stat.HostToDeviceBytes = HostToDeviceBytes;
stat.DeviceToHostBytes = DeviceToHostBytes;
stat.HostToDeviceXfer = HostToDeviceXfer;
stat.DeviceToHostXfer = DeviceToHostXfer;
stat.DeviceEvictions = DeviceEvictions;
stat.DeviceDestroy = DeviceDestroy;
stat.DeviceAllocCacheBytes = DeviceCacheBytes();
stat.HostAllocCacheBytes = HostCacheBytes();
return stat;
};
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
typedef std::list<uint64_t> LRU_t;
typedef typename LRU_t::iterator LRUiterator;
typedef struct {
int LRU_valid;
LRUiterator LRU_entry;
uint64_t CpuPtr;
uint64_t AccPtr;
size_t bytes;
uint32_t transient;
uint32_t state;
uint32_t accLock;
uint32_t cpuLock;
} AcceleratorViewEntry;
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
static AccViewTable_t AccViewTable;
static LRU_t LRU;
/////////////////////////////////////////////////
// Device motion
/////////////////////////////////////////////////
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
static void Evict(AcceleratorViewEntry &AccCache);
static void Flush(AcceleratorViewEntry &AccCache);
static void Clone(AcceleratorViewEntry &AccCache);
static void AccDiscard(AcceleratorViewEntry &AccCache);
static void CpuDiscard(AcceleratorViewEntry &AccCache);
// static void LRUupdate(AcceleratorViewEntry &AccCache);
static void LRUinsert(AcceleratorViewEntry &AccCache);
static void LRUremove(AcceleratorViewEntry &AccCache);
// manage entries in the table
static int EntryPresent(uint64_t CpuPtr);
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EntryErase (uint64_t CpuPtr);
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
static void AcceleratorViewClose(uint64_t AccPtr);
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
public:
static void DisplayMallinfo(void);
static void NotifyDeletion(void * CpuPtr);
static void Print(void);
static void PrintAll(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,603 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
NAMESPACE_BEGIN(Grid);
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...)
//#define mprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
MemoryManager::LRU_t MemoryManager::LRU;
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
/////////////////////////////////////////////////
// Mechanics of data table maintenance
/////////////////////////////////////////////////
int MemoryManager::EntryPresent(uint64_t CpuPtr)
{
if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
return count;
}
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
assert(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty;
AccCache.LRU_valid=0;
AccCache.transient=0;
AccCache.accLock=0;
AccCache.cpuLock=0;
AccViewTable[CpuPtr] = AccCache;
}
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
assert(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr);
assert(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator;
}
void MemoryManager::EntryErase(uint64_t CpuPtr)
{
auto AccCache = EntryLookup(CpuPtr);
AccViewTable.erase(CpuPtr);
}
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==0);
if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end();
} else {
LRU.push_front(AccCache.CpuPtr);
AccCache.LRU_entry = LRU.begin();
}
AccCache.LRU_valid = 1;
DeviceLRUBytes+=AccCache.bytes;
}
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes;
}
/////////////////////////////////////////////////
// Accelerator cache motion & consistency logic
/////////////////////////////////////////////////
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////
// Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool.
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++;
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
// Cannot be acclocked. If allocated must be in LRU pool.
//
// Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
// and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
// but there is a weakness where CpuLock entries are attempted for erase
// Take these OUT LRU queue when CPU locked?
// Cannot take out the table as cpuLock data is important.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return;
if (AccCache.cpuLock!=0) return;
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
// EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==AccDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
}
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==CpuDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
AccCache.state=Consistent;
}
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state!=Empty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
AccCache.state=AccDirty;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
} else {
assert(0);
}
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
return NULL;
}
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
} else {
return;
}
}
}
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,hint);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes,
(uint64_t)AccCache.accLock);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Cpu starts primary
if(mode==AcceleratorWriteDiscard){
CpuDiscard(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite){
Clone(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else {
assert(0);
}
assert(AccCache.accLock>0);
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache);
}
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
return AccCache.AccPtr;
}
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock==0);
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache);
} else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock>0);
assert(AccCache.accLock==0);
AccCache.cpuLock--;
}
/*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,transient);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
// CPU doesn't need to free space
// if (!AccCache.AccPtr) {
// EvictVictims(bytes);
// }
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes);
}
if(AccCache.state==Empty) {
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache.accLock= 0;
AccCache.cpuLock= 1;
} else if(AccCache.state==CpuDirty ){
// AccPtr dont care, deferred allocate
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++;
} else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++;
} else {
assert(0); // should be unreachable
}
AccCache.transient= transient? EvictNext : 0;
return AccCache.CpuPtr;
}
void MemoryManager::NotifyDeletion(void *_ptr)
{
// Look up in ViewCache
uint64_t ptr = (uint64_t)_ptr;
if(EntryPresent(ptr)) {
auto e = EntryLookup(ptr);
AccDiscard(e->second);
}
}
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Memory Manager " << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogMessage << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
acceleratorMem();
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
}
void MemoryManager::PrintAll(void)
{
Print();
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
return AccCache.cpuLock+AccCache.accLock;
} else {
return 0;
}
}
void MemoryManager::Audit(std::string s)
{
uint64_t CpuBytes=0;
uint64_t AccBytes=0;
uint64_t LruBytes1=0;
uint64_t LruBytes2=0;
uint64_t LruCnt=0;
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it;
assert(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes;
assert(AccCache.LRU_valid==1);
assert(AccCache.LRU_entry==it);
}
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
CpuBytes+=AccCache.bytes;
if( AccCache.AccPtr ) AccBytes+=AccCache.bytes;
if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) {
assert(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t cpuLock " << AccCache.cpuLock
<< "\t accLock " << AccCache.accLock
<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
}
assert( AccCache.cpuLock== 0 ) ;
assert( AccCache.accLock== 0 ) ;
}
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
assert(LruBytes1==LruBytes2);
assert(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
assert(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
assert(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
}
void MemoryManager::PrintState(void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
} else {
std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
}
}
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,31 @@
#include <Grid/GridCore.h>
#ifdef GRID_UVM
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
void MemoryManager::Audit(std::string s){};
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::PrintState(void* CpuPtr)
{
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::PrintAll(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);
#endif

View File

@ -1,70 +1,11 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<Ncache;e++) {
if ( Entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%Ncache;
}
if ( Entries[v].valid ) {
ret = Entries[v].address;
Entries[v].valid = 0;
Entries[v].address = NULL;
Entries[v].bytes = 0;
}
Entries[v].address=ptr;
Entries[v].bytes =bytes;
Entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL;
#ifdef _OPENMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<Ncache;e++){
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
Entries[e].valid = 0;
return Entries[e].address;
}
}
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
@ -74,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
std::vector<uint64_t> pagedata(npages);
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
@ -90,7 +31,7 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
@ -106,20 +47,21 @@ std::string sizeString(const size_t bytes)
double count = bytes;
while (count >= 1024 && s < 7)
{
{
s++;
count /= 1024;
}
}
if (count - floor(count) == 0.0)
{
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
}
else
{
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
}
return std::string(buf);
}
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,95 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -25,268 +25,268 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H
NAMESPACE_BEGIN(Grid);
namespace Grid{
//////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid
//////////////////////////////////////////////////////////////////////
// unsigned long _ndimension;
// std::vector<int> _processors; // processor grid
// int _processor; // linear processor rank
// std::vector<int> _processor_coor; // linear processor rank
//////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread {
//////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid
//////////////////////////////////////////////////////////////////////
// unsigned long _ndimension;
// Coordinate _processors; // processor grid
// int _processor; // linear processor rank
// Coordinate _processor_coor; // linear processor rank
//////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread {
public:
int dummy;
// Give Lattice access
template<class object> friend class Lattice;
int dummy;
// Give Lattice access
template<class object> friend class Lattice;
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {};
GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;};
virtual ~GridBase() = default;
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;};
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;};
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
std::vector<int> _ostride; // Outer stride for each dimension
std::vector<int> _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
std::vector<int> _slice_block;// subslice information
std::vector<int> _slice_stride;
std::vector<int> _slice_nblock;
virtual ~GridBase() = default;
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
// Physics Grid information.
Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
Coordinate _gdimensions;// Global dimensions of array after cb removal
Coordinate _ldimensions;// local dimensions of array with processor images removed
Coordinate _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
Coordinate _ostride; // Outer stride for each dimension
Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int64_t _fsites; // _isites*_osites = product(dimensions).
int64_t _gsites;
Coordinate _slice_block;// subslice information
Coordinate _slice_stride;
Coordinate _slice_nblock;
bool _isCheckerBoarded;
Coordinate _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded;
int LocallyPeriodic;
Coordinate _checker_dim_mask;
int _checker_dim;
public:
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const std::vector<int> &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0;
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim) =0;
virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0;
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
//////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
//
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously.
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
//////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
//
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously.
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline int oIndexReduced(std::vector<int> &ocoor)
{
int idx=0;
// ocoor is already reduced so can eliminate the modulo operation
// for fast indexing and inline the routine
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx;
}
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
}
virtual int oIndex(Coordinate &coor)
{
int idx=0;
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
virtual int iIndex(Coordinate &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline int oIndexReduced(Coordinate &ocoor)
{
int idx=0;
// ocoor is already reduced so can eliminate the modulo operation
// for fast indexing and inline the routine
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx;
}
inline void oCoorFromOindex (Coordinate& coor,int Oindex){
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
}
inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) {
lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
}
inline void InOutCoorToLocalCoor (Coordinate &ocoor, Coordinate &icoor, Coordinate &lcoor) {
lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
inline void iCoorFromIindex(Coordinate &coor,int lane)
{
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
}
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
inline int PermuteType(int dimension){
int permute_type=0;
//
// FIXME:
//
// Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just
// a permute.
//
// Cases: PermuteType == 1,2,4,8
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
inline int PermuteType(int dimension){
int permute_type=0;
//
// Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just
// a permute.
//
// Cases: PermuteType == 1,2,4,8
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) const { return _isites; };
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const std::vector<int> LocalStarts(void) { return _lstart; };
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) const { return _isites; };
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; };
inline const Coordinate &FullDimensions(void) { return _fdimensions;};
inline const Coordinate &GlobalDimensions(void) { return _gdimensions;};
inline const Coordinate &LocalDimensions(void) { return _ldimensions;};
inline const Coordinate &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
}
}
void GlobalCoorToProcessorCoorLocalCoor(Coordinate &pcoor,Coordinate &lcoor,const Coordinate &gcoor)
{
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
}
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
{
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
}
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
{
std::vector<int> pcoor;
std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
std::vector<int> cblcoor(lcoor);
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const Coordinate &gcoor)
{
Coordinate pcoor;
Coordinate lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
Coordinate cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
*/
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
}
*/
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
std::vector<int> coor(_ndimension);
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , Coordinate &gcoor)
{
gcoor.resize(_ndimension);
Coordinate coor(_ndimension);
ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
iCoorFromIindex(coor,i_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
iCoorFromIindex(coor,i_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
oCoorFromOindex (coor,o_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
oCoorFromOindex (coor,o_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,Coordinate &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
}
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
}
}
void ProcessorCoorLocalCoorToGlobalCoor(Coordinate &Pcoor,Coordinate &Lcoor,Coordinate &gcoor)
{
gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,98 +23,102 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H
namespace Grid{
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
class GridCartesian: public GridBase {
public:
int dummy;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
virtual int CheckerBoardFromOindex (int Oindex)
{
return 0;
}
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(const std::vector<int> &site){
return 0;
}
virtual int CheckerBoardDestination(int cb,int shift,int dim){
return 0;
}
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
return shift;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{
Init(dimensions,simd_layout,processor_grid);
}
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{
Init(dimensions,simd_layout,processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
{
Init(dimensions,simd_layout,processor_grid);
}
int dummy;
// Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
virtual int CheckerBoardFromOindex (int Oindex)
{
return 0;
}
virtual int CheckerBoarded(int dim) {
return 0;
}
virtual int CheckerBoard(const Coordinate &site){
return 0;
}
virtual int CheckerBoardDestination(int cb,int shift,int dim){
return 0;
}
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
return shift;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{
Init(dimensions,simd_layout,processor_grid);
}
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{
Init(dimensions,simd_layout,processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid) : GridBase(processor_grid)
{
Init(dimensions,simd_layout,processor_grid);
}
virtual ~GridCartesian() = default;
virtual ~GridCartesian() = default;
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid)
{
///////////////////////
// Grid information
///////////////////////
void Init(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_isCheckerBoarded = false;
_ndimension = dimensions.size();
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_checker_dim = -1;
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
_fsites = _gsites = _osites = _isites = 1;
for (int d = 0; d < _ndimension; d++)
for (int d = 0; d < _ndimension; d++)
{
_checker_dim_mask[d]=0;
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
@ -136,30 +140,30 @@ public:
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
@ -167,8 +171,9 @@ public:
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
};
};
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -24,178 +24,164 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_RED_BLACK_H
#define GRID_CARTESIAN_RED_BLACK_H
NAMESPACE_BEGIN(Grid);
namespace Grid {
static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk)
{
int nd=rdim.size();
Coordinate coor(nd);
Lexicographic::CoorFromIndex(coor,oindex,rdim);
int linear=0;
for(int d=0;d<nd;d++){
if(chk_dim_msk[d])
linear=linear+coor[d];
}
return (linear&0x1);
}
static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
std::vector<int> _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;
// Coordinate _checker_dim_mask;
// int _checker_dim;
std::vector<int> _checker_board;
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;
}
virtual int CheckerBoard(const std::vector<int> &site){
int linear=0;
assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d])
linear=linear+site[d];
}
return (linear&0x1);
virtual int isCheckerBoarded(void) const { return 1; };
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;
}
virtual int CheckerBoard(const Coordinate &site){
int linear=0;
assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d])
linear=linear+site[d];
}
return (linear&0x1);
}
// Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e)
// we need
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
if(dim != _checker_dim) return shift;
// Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e)
// we need
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
if(dim != _checker_dim) return shift;
int fulldim =_fdimensions[dim];
shift = (shift+fulldim)%fulldim;
int fulldim =_fdimensions[dim];
shift = (shift+fulldim)%fulldim;
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
virtual int CheckerBoardFromOindexTable (int Oindex) {
return _checker_board[Oindex];
}
virtual int CheckerBoardFromOindex (int Oindex)
{
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
}
virtual int CheckerBoardFromOindexTable (int Oindex) {
return _checker_board[Oindex];
}
virtual int CheckerBoardFromOindex (int Oindex)
{
Coordinate ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != _checker_dim) return shift;
if(dim != _checker_dim) return shift;
int ocb=CheckerBoardFromOindex(osite);
int ocb=CheckerBoardFromOindex(osite);
return CheckerBoardShiftForCB(source_cb,dim,shift,ocb);
}
return CheckerBoardShiftForCB(source_cb,dim,shift,ocb);
}
virtual int CheckerBoardDestination(int source_cb,int shift,int dim){
if ( _checker_dim_mask[dim] ) {
// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
// does NOT cause a parity hop.
int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim];
if ( (shift+add) &0x1) {
return 1-source_cb;
} else {
return source_cb;
}
virtual int CheckerBoardDestination(int source_cb,int shift,int dim){
if ( _checker_dim_mask[dim] ) {
// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
// does NOT cause a parity hop.
int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim];
if ( (shift+add) &0x1) {
return 1-source_cb;
} else {
return source_cb;
}
};
} else {
return source_cb;
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{
int dims = base->_ndimension;
std::vector<int> checker_dim_mask(dims,1);
int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
};
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(base->_processors,*base)
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
}
};
virtual ~GridRedBlackCartesian() = default;
#if 0
////////////////////////////////////////////////////////////
// Create redblack grid ;; deprecate these. Should not
// need direct creation of redblack without a full grid to base on
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(processor_grid,*base)
{
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{
int dims = base->_ndimension;
Coordinate checker_dim_mask(dims,1);
int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
};
////////////////////////////////////////////////////////////
// Create redblack grid
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid,*base)
{
std::vector<int> checker_dim_mask(dimensions.size(),1);
int checker_dim = 0;
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
#endif
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const Coordinate &checker_dim_mask,
int checker_dim
) : GridBase(base->_processors,*base)
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
}
virtual ~GridRedBlackCartesian() = default;
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim)
{
void Init(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const Coordinate &checker_dim_mask,
int checker_dim)
{
_isCheckerBoarded = true;
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
_fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask = checker_dim_mask;
_checker_dim_mask = checker_dim_mask;
for (int d = 0; d < _ndimension; d++)
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
@ -203,11 +189,11 @@ public:
_gsites = _gsites * _gdimensions[d];
if (d == _checker_dim)
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2;
}
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2;
}
_ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
@ -222,42 +208,42 @@ public:
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if (_simd_layout[d] > 1)
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
@ -266,55 +252,55 @@ public:
block = block * _rdimensions[d];
}
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for (int d = 0; d < _ndimension; d++)
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for (int d = 0; d < _ndimension; d++)
{
rvol = rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for (int osite = 0; osite < _osites; osite++)
_checker_board.resize(rvol);
for (int osite = 0; osite < _osites; osite++)
{
_checker_board[osite] = CheckerBoardFromOindex(osite);
}
};
};
protected:
virtual int oIndex(std::vector<int> &coor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
protected:
virtual int oIndex(Coordinate &coor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
else
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
}
return idx;
};
return idx;
};
virtual int iIndex(std::vector<int> &lcoor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
virtual int iIndex(Coordinate &lcoor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
else
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
}
return idx;
}
return idx;
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,11 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
#include <Grid/util/Coordinate.h>
#include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h>

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,15 +23,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/mman.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
bool Stencil_force_mpi = true;
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
@ -47,30 +49,42 @@ int CartesianCommunicator::Dimensions(void) { return
int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; };
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
const Coordinate & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const Coordinate & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
}
NAMESPACE_END(Grid);

View File

@ -1,5 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -24,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATOR_BASE_H
#define GRID_COMMUNICATOR_BASE_H
@ -34,7 +33,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
#define NVLINK_GET
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
class CartesianCommunicator : public SharedMemory {
@ -52,10 +55,11 @@ public:
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
Coordinate _shm_processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processor_coor; // linear processor coordinate
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
@ -69,101 +73,138 @@ public:
// Constructors to sub-divide a parent communicator
// and default to comm world
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const std::vector<int> &pdimensions_in);
CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const Coordinate &pdimensions_in);
virtual ~CartesianCommunicator();
private:
private:
////////////////////////////////////////////////
// Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private
////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
public:
void InitFromMPICommunicator(const Coordinate &processors, Grid_MPI_Comm communicator_base);
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
int RankFromProcessorCoor(Coordinate &coor);
void ProcessorCoorFromRank(int rank,Coordinate &coor);
int Dimensions(void) ;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
const Coordinate & ThisProcessorCoor(void) ;
const Coordinate & ShmGrid(void) { return _shm_processors; } ;
const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
static void BarrierWorld(void);
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalMax(RealD &);
void GlobalMax(RealF &);
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSumVector(uint64_t*,int N);
void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c);
void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
scalar_type * ptr = (scalar_type *)& o; // Safe alias
GlobalSumVector(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir);
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,
int recv_from_rank,int do_recv,
int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,
int bytes,int dir);
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@ -197,11 +238,12 @@ public:
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};
{
Broadcast(root,(void *)&data,sizeof(data));
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@ -1,6 +1,6 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
@ -23,19 +23,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
// First initialise of comms system
////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char ***argv)
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
@ -43,20 +44,35 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
#ifndef GRID_COMMS_THREADS
nCommThreads=1;
// wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
// other comms schemes are ok
MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
#else
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
#endif
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
assert(0);
}
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
assert(0);
}
}
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
Grid_quiesce_nodes();
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
Grid_unquiesce_nodes();
}
///////////////////////////////////////////////////////////////////////////
@ -67,14 +83,14 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
@ -84,14 +100,14 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
MPI_Comm optimal_comm;
////////////////////////////////////////////////////
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
@ -103,25 +119,25 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size();
_ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1);
Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
shm_processors [pad+d]=parent._shm_processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
// int Nparent = parent._processors ;
// int Nparent = parent._processors ;
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
@ -132,54 +148,25 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
Coordinate ccoor(_ndimension); // coor within subcommunicator
Coordinate scoor(_ndimension); // coor of split within parent
Coordinate ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
int crank;
int crank;
// Mpi uses the reverse Lexico convention to us; so reversed routines called
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
MPI_Comm comm_split;
if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
if ( Nchild > 1 ) {
////////////////////////////////////////////////////////////////
// Split the communicator
@ -202,13 +189,13 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
// Take the right SHM buffers
//////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split);
///////////////////////////////////////////////
// Free the temp communicator
// Free the temp communicator
///////////////////////////////////////////////
MPI_Comm_free(&comm_split);
if(0){
if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
for(int d=0;d<processors.size();d++){
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
@ -219,7 +206,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
}
}
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base)
{
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
@ -236,7 +223,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
Coordinate periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@ -269,8 +256,27 @@ CartesianCommunicator::~CartesianCommunicator()
for(int i=0;i<communicator_halo.size();i++){
MPI_Comm_free(&communicator_halo[i]);
}
}
}
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -279,6 +285,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
@ -287,8 +297,14 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
void CartesianCommunicator::GlobalMax(float &f)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
@ -296,16 +312,49 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
MPI_Request xrq;
MPI_Request rrq;
assert(dest != _processor);
assert(from != _processor);
int tag;
tag= dir+from*32;
int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
@ -313,81 +362,58 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<MpiCommsRequest_t> reqs(0);
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
// Enforce no UVM in comms, device or host OK
assert(acceleratorIsCommunicable(xmit));
assert(acceleratorIsCommunicable(recv));
// Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest,
int dest, int dox,
void *recv,
int from,
int from, int dor,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
int dest,int dox,
void *recv,
int from,
int bytes,int dir)
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
@ -402,46 +428,363 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
int tag;
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
}
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
}
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
if (nreq==0) return;
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// int nreq=list.size();
// if (nreq==0) return;
// std::vector<MPI_Status> status(nreq);
// std::vector<MPI_Request> MpiRequests(nreq);
// for(int r=0;r<nreq;r++){
// MpiRequests[r] = list[r].req;
// }
// int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
// assert(ierr==0);
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
//{
//}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
@ -456,11 +799,15 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
communicator);
assert(ierr==0);
}
int CartesianCommunicator::RankWorld(void){
int r;
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BarrierWorld(void){
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
@ -473,7 +820,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
Coordinate row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
@ -490,7 +837,7 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
// (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object;
int iwords;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
@ -502,7 +849,4 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
MPI_Type_free(&object);
}
}
NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -23,11 +23,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
@ -38,21 +38,23 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
{
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{
_shm_processors = Coordinate(processors.size(),1);
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
_shm_processors = Coordinate(processors.size(),1);
_processors = processors;
_ndimension = processors.size();
_ndimension = processors.size(); assert(_ndimension>=1);
_processor_coor.resize(_ndimension);
// Require 1^N processor grid for fake
@ -67,24 +69,18 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalMax(float &){}
void CartesianCommunicator::GlobalMax(double &){}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSum(uint64_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit,
@ -95,20 +91,17 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
int bytes,int dir)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
@ -122,8 +115,9 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor; }
void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0;
@ -131,35 +125,39 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,
int recv_from_rank,int dor,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,
int bytes, int dir)
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
return xbytes+rbytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};
NAMESPACE_END(Grid);
}

View File

@ -28,10 +28,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
// static data
int GlobalSharedMemory::HPEhypercube = 1;
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup;
@ -39,6 +40,9 @@ int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
void * GlobalSharedMemory::HostCommBuf;
#endif
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
@ -65,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
void *SharedMemory::HostBufferMalloc(size_t bytes){
void *ptr = (void *)host_heap_top;
host_heap_top += bytes;
host_heap_bytes+= bytes;
if (host_heap_bytes >= host_heap_size) {
std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(host_heap_bytes<host_heap_size);
}
return ptr;
}
void SharedMemory::HostBufferFreeAll(void) {
host_heap_top =(size_t)HostCommBuf;
host_heap_bytes=0;
}
#endif
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
@ -73,9 +97,12 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
if (heap_bytes >= heap_size) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(heap_bytes<heap_size);
}
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr;
}
void SharedMemory::ShmBufferFreeAll(void) {
@ -84,9 +111,62 @@ void SharedMemory::ShmBufferFreeAll(void) {
}
void *SharedMemory::ShmBufferSelf(void)
{
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank];
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
NAMESPACE_END(Grid);

View File

@ -25,18 +25,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
// b) shared memory buffers container
// -- static globally shared; init once
// -- per instance set of buffers.
//
#pragma once
#include <Grid/GridCore.h>
@ -53,30 +41,65 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
namespace Grid {
NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request CommsRequest_t;
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
class GlobalSharedMemory {
private:
private:
static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation
static int _ShmSetup;
static int _ShmAlloc;
static uint64_t _ShmAllocBytes;
public:
public:
///////////////////////////////////////
// HPE 8600 hypercube optimisation
///////////////////////////////////////
static int HPEhypercube;
static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
@ -84,7 +107,9 @@ class GlobalSharedMemory {
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
static void *HostCommBuf;
#endif
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
@ -102,12 +127,18 @@ class GlobalSharedMemory {
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
// Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};
@ -116,14 +147,21 @@ class GlobalSharedMemory {
//////////////////////////////
class SharedMemory
{
private:
private:
static const int MAXLOG2RANKSPERNODE = 16;
size_t heap_top;
size_t heap_bytes;
size_t heap_size;
protected:
#ifndef ACCELERATOR_AWARE_MPI
size_t host_heap_top; // set in free all
size_t host_heap_bytes;// set in free all
void *HostCommBuf; // set in SetCommunicator
size_t host_heap_size; // set in SetCommunicator
#endif
protected:
Grid_MPI_Comm ShmComm; // for barriers
int ShmRank;
@ -131,7 +169,7 @@ class SharedMemory
std::vector<void *> ShmCommBufs;
std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks
public:
public:
SharedMemory() {};
~SharedMemory();
///////////////////////////////////////////////////////////////////////////////////////
@ -148,12 +186,16 @@ class SharedMemory
// Call on any instance
///////////////////////////////////////////////////
void SharedMemoryTest(void);
void *ShmBufferSelf(void);
void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
#ifndef ACCELERATOR_AWARE_MPI
void *HostBufferMalloc(size_t bytes);
void HostBufferFreeAll(void);
#endif
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////
@ -162,4 +204,5 @@ class SharedMemory
};
}
NAMESPACE_END(Grid);

View File

@ -7,6 +7,7 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -26,10 +27,137 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#define Mheader "SharedMemoryMpi: "
#include <Grid/GridCore.h>
#include <pwd.h>
namespace Grid {
#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCL
#ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#endif
#include <sys/socket.h>
#include <sys/un.h>
NAMESPACE_BEGIN(Grid);
#ifdef SHM_SOCKETS
/*
* Barbaric extra intranode communication route in case we need sockets to pass FDs
* Forced by level_zero not being nicely designed
*/
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
sendmsg(sock, &msg, 0);
};
};
#endif
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@ -43,15 +171,26 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
#else
MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
#endif
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes
WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ?
/////////////////////////////////////////////////////////////////////
@ -130,9 +269,25 @@ int Log2Size(int TwoToPower,int MAXLOG2)
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
//////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions
//////////////////////////////////////////////////////////////////////////////
const int namelen = _POSIX_HOST_NAME_MAX;
char name[namelen];
int R;
int I;
int N;
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
#ifdef HYPERCUBE
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
@ -173,9 +328,9 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
}
std::string hname(name);
std::cout << "hostname "<<hname<<std::endl;
std::cout << "R " << R << " I " << I << " N "<< N
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
// std::cout << "hostname "<<hname<<std::endl;
// std::cout << "R " << R << " I " << I << " N "<< N
// << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
//////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition.
@ -197,17 +352,15 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
Coordinate processor_coor(ndimension);
Coordinate WorldDims = processors;
Coordinate ShmDims (ndimension); Coordinate NodeDims (ndimension);
Coordinate ShmCoor (ndimension); Coordinate NodeCoor (ndimension); Coordinate WorldCoor(ndimension);
Coordinate HyperCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM = ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
@ -225,7 +378,7 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
HyperCoor[d]=hcoor & msk;
HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
hcoor = hcoor >> bits;
}
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
@ -253,27 +406,20 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#else
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
}
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
Coordinate processor_coor(ndimension);
Coordinate WorldDims = processors; Coordinate ShmDims(ndimension); Coordinate NodeDims (ndimension);
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM=ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
@ -306,7 +452,6 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET
@ -314,7 +459,7 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
@ -337,7 +482,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
int errsv = errno;
printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %lld\n",size);
printf("size %ld\n",size);
printf("flags %d\n",flags);
perror("shmget");
exit(1);
@ -369,14 +514,237 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// TODO/FIXME : NOT ALL NVLINK BOARDS have full Peer to peer connectivity.
// The annoyance is that they have partial peer 2 peer. This occurs on the 8 GPU blades.
// e.g. DGX1, supermicro board,
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef SHM_SOCKETS
UnixSockets::Open(WorldShmRank);
#endif
for(int r=0;r<WorldShmSize;r++){
MPI_Barrier(WorldShmComm);
#ifndef GRID_MPI3_SHM_NONE
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
#ifdef SHM_SOCKETS
for(int rr=0;rr<WorldShmSize;rr++){
if(rr!=r){
UnixSockets::SendFileDescriptor(handle.fd,rr);
}
}
#endif
}
#endif
#ifdef GRID_CUDA
cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
#ifdef GRID_HIP
hipIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
MPI_Barrier(WorldShmComm);
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
r,
WorldShmComm);
assert(ierr==0);
}
///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
int myfd;
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
if (myfd < 0) {
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
perror("pidfd_getfd failed ");
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
}
assert(thisBuf!=nullptr);
}
#endif
#ifdef GRID_CUDA
if ( r!=WorldShmRank ) {
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
#ifdef GRID_HIP
if ( r!=WorldShmRank ) {
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
}
WorldShmCommBufs[r] = thisBuf;
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
MPI_Barrier(WorldShmComm);
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -413,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@ -423,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -470,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
@ -499,7 +867,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap");
assert(0);
@ -536,14 +903,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes = bytes;
}
#endif
#endif // End NVCC case for GPU device buffers
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
// Routines accessing shared memory should route through for GPU safety
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorMemSet(dest,0,bytes);
#else
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
int rank, size;
@ -554,7 +938,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
#else
MPI_Comm_split(comm, rank, 0, &ShmComm);
#endif
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
ShmCommBufs.resize(ShmSize);
@ -571,10 +959,15 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
}
ShmBufferFreeAll();
#ifndef ACCELERATOR_AWARE_MPI
host_heap_size = heap_size;
HostCommBuf= GlobalSharedMemory::HostCommBuf;
HostBufferFreeAll();
#endif
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
@ -584,6 +977,19 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_SHM_FORCE_MPI
// Hide the shared memory path between ranks
{
for(int r=0;r<size;r++){
if ( r!=rank ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
}
#endif
//SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@ -598,24 +1004,26 @@ void SharedMemory::ShmBarrier(void)
void SharedMemory::SharedMemoryTest(void)
{
ShmBarrier();
uint64_t check[3];
uint64_t magic = 0x5A5A5A;
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GlobalSharedMemory::WorldNode;
check[1] = r;
check[2] = 0x5A5A5A;
for(uint64_t r=0;r<ShmSize;r++){
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
}
}
ShmBarrier();
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
}
void *SharedMemory::ShmBuffer(int rank)
@ -629,7 +1037,6 @@ void *SharedMemory::ShmBuffer(int rank)
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){
@ -648,4 +1055,5 @@ SharedMemory::~SharedMemory()
}
};
}
NAMESPACE_END(Grid);

Some files were not shown because too many files have changed in this diff Show More