1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Compare commits

...

491 Commits

Author SHA1 Message Date
9203126aa5 Scripts 2025-06-11 15:30:16 +02:00
f90ba4712a Update for Jupiter 2025-06-11 15:24:34 +02:00
3737a24096 Updated python output 2025-06-03 14:09:29 -04:00
d418f78352 Making running on Aurora more debuggable 2025-05-23 20:58:16 +00:00
25163998a0 Makes SYCL compiler happy 2025-05-23 20:57:11 +00:00
dc546aaa4b Updated config options for BNL cluster 2025-05-13 18:44:47 -04:00
5364d580c9 Output chirality, eigenvector density files and python source lego plot 2025-05-13 18:44:47 -04:00
2a9a6347e3 Do not require Grid format RNGs and also to the 5Li reporting 2025-05-13 18:44:47 -04:00
cfdb56f314 Run measurements at t=0 too 2025-05-13 18:44:46 -04:00
b517e88db3 Update README 2025-05-13 16:49:21 -04:00
bb317aba8d Lattice = for sycl 2025-05-13 12:50:58 +00:00
644cc6647e JSON update 2025-05-13 12:50:58 +00:00
72397ce23b SYCL interface change 2025-05-13 12:50:58 +00:00
d60a80c098 Fixes and visualisation 2025-04-29 18:04:23 -04:00
bb8b6d9d73 Fix 2025-04-29 18:04:04 -04:00
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGagueAction deriv
2025-04-24 14:39:10 -04:00
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
233150d93f Bug fix for no accelerator aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
938c47480f Updated compile on frontier.
Unsatisfactory hacsk
2025-04-04 18:35:06 -04:00
3811d19298 Fence 2025-04-04 18:35:06 -04:00
83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
9eae8fca5d Size outut 2025-04-04 18:35:05 -04:00
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora nowb 2025-03-11 13:50:43 +00:00
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
3d014864e2 Makinig LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
b1ba209696 Latest upstream with np-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regresion test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after revering too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
d2dd8f54e2 Fixing after revering too much! 2025-02-17 17:32:27 +00:00
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce throuhgh Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issueing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
b50fb34e71 Perf on Aurora 2025-02-01 18:39:34 +00:00
de84d730ff Fastest run config on Aurora to date 2025-02-01 18:08:40 +00:00
c74d11e3d7 PVdagM MG 2025-02-01 11:04:13 -05:00
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
c4fc972fec Merge branch 'feature/deprecate-uvm' into develop 2025-01-31 16:32:36 +00:00
8cf809e231 Best results on Aurora so far 2025-01-31 16:14:45 +00:00
94019a922e Significantly better performance on Aurora without using pipeline mode 2025-01-30 16:36:46 +00:00
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
d6b2727f86 Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora 2025-01-29 09:22:21 +00:00
74a4f43946 Optional host buffer bounce for no CUDA aware MPI 2025-01-28 15:22:46 +00:00
1caf8b0f86 Rename 2025-01-28 15:22:37 +00:00
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
e99d42404e Removing the regresion test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
8fe429346f Dslash testing for reproduce 2024-11-11 23:11:11 +00:00
5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
b91fc1b6b4 Merge branch 'feature/boosted' into feature/deprecate-uvm
Fixed boosted free field test
2024-10-28 16:53:09 -04:00
eafc150034 Test fft asserts 2024-10-23 16:46:26 -04:00
2877f1a268 Verbose reduce 2024-10-23 15:14:16 -04:00
1e893af775 GPU happy 2024-10-23 14:52:15 -04:00
d9f430a575 Happy GPU 2024-10-23 14:51:16 -04:00
63abe87f36 Memory manager verbose improvements that were useful to track an error 2024-10-23 14:49:13 -04:00
368d649c8a feature/deprecate-uvm happier -- preallocate device resident neigbour table 2024-10-23 14:47:55 -04:00
5603464f39 Fix in partial fraction import/export physical and
make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class
2024-10-23 14:45:58 -04:00
655c79f39e Suppress warning on partial override 2024-10-23 14:44:41 -04:00
565b231c03 Nvcc happy 2024-10-23 14:44:17 -04:00
62a9f180fa NVCC happy 2024-10-23 14:44:04 -04:00
5ae77876a8 Meson field and Aslash field on GPU; some compiler warning removed 2024-10-18 19:08:06 -04:00
4ed2c2c74f Config command 2024-10-18 13:58:33 -04:00
955da582b6 Working on NVCC 2024-10-18 13:58:03 -04:00
11b07b950d Vanilla linux compile, assuming spack prerequisites 2024-10-18 13:57:40 -04:00
8f70cfeda9 Clean up 2024-10-18 13:56:53 -04:00
ce64271048 Remove the copying version 2024-10-18 13:56:24 -04:00
5cc4f3241d Meson field test 2024-10-18 15:42:30 +00:00
6815e138b4 Boosted fermion attempt 2024-10-17 18:37:33 +01:00
a78a61d76f Update configure 2024-10-15 14:38:45 +00:00
2eff3f34ed Alternate reduction; default to grids own but make a configure flag
--enable-reduction=grid|mpi
2024-10-15 14:36:06 +00:00
03687c1d62 Final version of test, closer to original again 2024-10-15 14:35:17 +00:00
febfe4e77f Make my own reduction a configure flag 2024-10-15 14:32:35 +00:00
4d1aa134b5 Use normal reduction, configure flag to force deterministic 2024-10-15 14:32:11 +00:00
5ec879860a Odd rounding issue - bears looking into 2024-10-15 14:30:54 +00:00
f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
b728af903c Fast axpy norm under CFLAG 2024-10-11 03:23:09 +00:00
54f1999030 axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues 2024-10-11 03:22:18 +00:00
fd58f0b669 Return ok 2024-10-11 03:21:21 +00:00
c5c67b706e cl::sycl -> SYCL 2024-10-10 22:04:12 +00:00
be7a543e2c Revert barriers -- these were not the problem 2024-10-10 22:03:29 +00:00
68f112d576 New software moves cl::sycl 2024-10-10 22:03:04 +00:00
ec1395a304 Better flight logging 2024-10-10 22:01:57 +00:00
beb0e474ee Use deterministic own brand reduction 2024-10-10 22:01:24 +00:00
2b5fdcbbc5 New software version 2024-10-10 21:59:02 +00:00
295127d456 Deterministic homebrew reduction 2024-10-10 21:58:26 +00:00
7dcfb13694 New software stack 2024-10-10 21:57:35 +00:00
ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows for work around on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
9fa8bd6438 Configure for AOT on Aurora latest software 2024-09-23 11:25:44 +00:00
02c8178f16 Almost working on Aurora 2024-09-23 09:43:50 +00:00
e637fbacae Verbose remove 2024-09-23 09:42:43 +00:00
066544281f Deprecate UVM 2024-09-17 13:34:27 +00:00
11be10d2c0 Aurora testing 2024-09-10 18:11:52 +00:00
160969a758 UVM tester, doesn't turn up anything 2024-09-10 18:09:42 +00:00
622f78ebea SYCL updates -- operator = giving trouble on Aurora.
SYCL reduction is failing intermittently with SVM interface - returns
zero, expect non-zero.
Think I need to remove ALL dependence on SVM.
2024-09-04 13:53:48 +00:00
aa67a5b095 Rename 2024-08-27 19:54:01 +00:00
af9ea0864c Blas fix 2024-08-27 19:53:09 +00:00
4e2a6d87c4 Gemm batched fix 2024-08-27 19:24:05 +00:00
a465ecece9 Aurora 2024-08-27 19:20:43 +00:00
575eb72182 Converges on 16^3 2024-08-27 19:20:38 +00:00
3a973914d6 Compile on frontier 2024-08-27 14:55:42 -04:00
f568c07bbd Improved the BLAS benchmark 2024-08-27 14:53:54 -04:00
2c9878fc3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-27 12:05:46 -04:00
27b1b1b005 Checkerboard available for offloading pickCheckerboard 2024-08-27 12:04:09 -04:00
130d7ab077 Verbose changes 2024-08-27 12:03:28 -04:00
29f6b8a74a Setup 2024-08-27 12:02:49 -04:00
9779aaea33 16^3 optimise 2024-08-27 11:38:35 -04:00
ec25604a67 Fastest solver for mrhs multigrid 2024-08-27 11:32:34 -04:00
3668e81c5e Extract slice working on checkerboard field for Block Lanczos 2024-08-27 11:31:30 -04:00
d66b2423cb Move slice operations to GPU for BlockCG 2024-08-27 11:28:47 -04:00
15cc78f0b6 peek/poke local site on checkerboard arrays 2024-08-27 11:23:42 -04:00
06db4ddea2 Fast init on GPU 2024-08-27 11:22:33 -04:00
6cfb90e99f Support needed for accelerator resident set/pick Checkerboard 2024-08-27 11:19:00 -04:00
d8be95a2a3 Don't early terminate power method to get more accurate top EV 2024-08-27 11:17:37 -04:00
f82702872d Normal residual 2024-08-27 11:16:44 -04:00
3752c49ef0 Add option to record the CG polynomial 2024-08-27 11:14:35 -04:00
fe65fa4988 MulMatrix 2024-08-27 11:13:18 -04:00
1fe4c205a3 Adef 2024-08-27 11:11:47 -04:00
d4dc5e0f43 BlockCG linalg acceleratoin with BLAS 2024-08-27 11:08:33 -04:00
77944437ce Functor initialisation 2024-08-27 11:01:02 -04:00
c164bff758 MMdag 2024-08-27 11:00:36 -04:00
aa2e3d954a MMdag operator 2024-08-27 10:59:29 -04:00
de62b04728 Block CG linalg acceleration 2024-08-27 10:58:54 -04:00
d0bdb50f24 Analyse power spectrum 2024-08-27 10:58:19 -04:00
a8fecbc609 BlockCG linalg via BLAS 2024-08-21 16:08:16 -04:00
557fa483ff Blas benchmark committed stand alone 2024-08-20 16:18:43 +00:00
fc15d55df6 Mallinfo 2024-08-20 14:33:09 +00:00
53573d7d94 Better benchmark 2024-08-20 14:31:57 +00:00
bb3c177000 Better benchmarking 2024-08-20 14:31:41 +00:00
a3322b470f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-20 14:30:52 +00:00
f8f408e7a9 BLAS everywhere 2024-07-25 18:09:02 +00:00
baac1127d0 Later intel compiler happiness 2024-07-25 18:06:05 +00:00
6f1328160c Remove SVM use 2024-07-25 18:05:40 +00:00
04cf902791 Mallinfo and ASAN hooks 2024-07-25 18:04:56 +00:00
7a5b1c1a19 Try Catch convenience macro 2024-07-25 18:03:41 +00:00
18d2d7da4a Eigen implementation and SYCL implementation 2024-07-25 18:02:56 +00:00
b461184797 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-23 09:53:58 -04:00
4563b39305 New Frontier config 2024-07-23 09:53:08 -04:00
c9d5674d5b FInal for paper 2024-07-22 15:26:45 -04:00
486412635a 8^4 test for PETSc 2024-07-22 15:25:17 -04:00
8b23a1546a Force compile temporarily 2024-07-22 15:24:56 -04:00
a901e4e369 Regressed performance for paper 2024-07-22 15:24:04 -04:00
804d9367d4 Regressed performance 2024-07-22 15:23:25 -04:00
41d8adca95 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-11 15:38:45 +00:00
059e8e5bb0 New compile option 2024-07-11 15:37:30 +00:00
b3ee8ded96 Respect command line 2024-07-11 15:34:48 +00:00
cf3584ad15 Convenient to monitor memory across an HMC trajectory 2024-07-11 15:30:32 +00:00
a66973163f Device vector not UVM 2024-07-11 15:24:11 +00:00
4502a8c8a1 libc malloc heap info dump on Linux 2024-07-11 15:22:18 +00:00
9c902e4c2d Batched blas, but not working yet on OneAPI 2024-07-11 15:19:49 +00:00
f3eb36adcf Namespace addition 2024-07-11 15:19:19 +00:00
7c246606c1 Schur additional case 2024-07-10 22:04:32 +00:00
172c75029e Redblack additional case 2024-07-10 22:03:59 +00:00
6ae52da571 LLVM leak sanitizer 2024-07-08 15:59:18 +00:00
4ee9c68053 Updated compile environment 2024-07-08 15:57:57 +00:00
a15b4378a3 Sanitizer preservation of options 2024-07-08 15:57:45 +00:00
89fdd7f8dd AOT compilation 2024-07-05 17:47:56 +00:00
c328be24b7 Sanitizer compile options 2024-07-05 17:46:43 +00:00
a73dc6dbf4 Display linux heap info 2024-06-28 16:05:17 +00:00
eee2a2657f Try catch exception wrappers 2024-06-28 16:02:29 +00:00
12b8be7cb9 Best so far on 96^3 350 Evecs converged on 4^4 block 2024-06-18 16:31:37 -04:00
63c223ea5d Verbose 2024-06-18 03:22:01 +00:00
2877fb4a2c More verbose if alloc failure 2024-06-18 03:21:03 +00:00
d299c86633 Std::asin,acos 2024-06-11 16:41:23 -04:00
6ce52092e8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 15:16:58 -04:00
b5926c1d21 Broadcast time info 2024-06-11 15:16:25 -04:00
9563238e9b Force initial to identity 2024-06-11 17:51:58 +00:00
fb9b1d76ca Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 16:48:16 +00:00
1739146599 Property to initialise reduction 2024-06-11 16:47:35 +00:00
ed20b39ab3 Log files from Frontier benchmark 2024-06-11 11:16:20 -04:00
284fc05f15 Protect vs. missing LIME libarary 2024-06-11 11:08:00 -04:00
07a07b6fa3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-10 15:09:25 -04:00
dc80b08969 96^3 test 2024-06-10 15:07:29 -04:00
a49a161f8d SYCL update to use buffer on reduction variable 2024-06-08 16:05:18 +00:00
a6479ca50f Shuhei's ComputeWilsonFlow main programme 2024-06-05 15:51:11 -04:00
0e607a55e7 Updated for 8^4 test 2024-05-26 20:53:05 +00:00
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
c4b9f71357 CPU compile ordering is important 2024-05-21 02:22:32 +01:00
394e506aea Compile options for tursa update 2024-05-21 02:10:04 +01:00
e19b26341b Tursa configure update 2024-05-21 01:14:27 +01:00
cfe1b13225 Back out zero change 2024-05-21 01:14:08 +01:00
890c5ea1cd Warning disable 2024-05-20 20:08:31 +01:00
a87378d3b6 Update 2024-05-20 20:08:31 +01:00
832fc08809 Merge pull request #459 from dbollweg/sycl_slicesum_update
Sycl slicesum bugfix
2024-05-20 15:06:53 -04:00
9a1ad6a5eb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-05-17 11:33:46 -04:00
a90eafad24 Merge branch 'feature/scidac-wp1' into develop 2024-05-17 11:32:00 -04:00
ad14a82742 Working aas good as possible on 48^3 in double 2024-05-16 10:55:45 -04:00
14e9d8ed9f CG improvements for smoother 2024-05-16 10:55:18 -04:00
0ac85fa70b Serialisation removal 2024-05-16 10:49:04 -04:00
c371de42b9 Some site tools for sitewise autocorr 2024-05-16 10:48:23 -04:00
ccf147d6c1 Select the compiler that gives better performance on sunspot 2024-05-07 18:45:56 +00:00
7aa12b446f New config command for sunspot 2024-05-07 18:45:40 +00:00
c293228102 layout control 2024-05-07 18:45:21 +00:00
5c4c9f721a Remove pbs file and replace with bench1 and bench2 for 1 and 2 nodes 2024-05-07 18:44:49 +00:00
057f86c1de 2 queues works ok in performance 2024-05-07 18:42:50 +00:00
cd52e3cbc2 Jobs on subspot 2024-05-07 18:38:15 +00:00
24602e1259 Accidental synchronise 2024-05-07 17:28:38 +00:00
8a098889fc Update FlightRecorder.cc 2024-04-30 21:15:08 +01:00
5c3ace7c3e Merge branch 'develop' into feature/scidac-wp1 2024-04-30 05:26:06 -04:00
aa148455b7 Updated todo list 2024-04-30 05:24:39 -04:00
98cf247f33 prepare to switch to mixed precision 2024-04-30 05:23:45 -04:00
0cf16522d1 Refine with HDCG choice 2024-04-30 05:22:14 -04:00
7b7c75f9e5 Setup 2024-04-30 05:21:02 -04:00
aefd255a3c Verbose 2024-04-30 05:20:41 -04:00
1c5aa939fd Subspace setup changes 2024-04-30 05:19:09 -04:00
3a0ff17be0 Verbose changes 2024-04-30 05:17:28 -04:00
47829ae5cc Verbose changes 2024-04-30 05:16:46 -04:00
bfa7b69aff Verbose changes 2024-04-16 15:42:46 -04:00
2aaa959b5f Printing changes 2024-04-16 15:41:25 -04:00
ce2970b93a Printing changes 2024-04-16 15:40:38 -04:00
7b76970d10 Verbose changes 2024-04-16 15:40:10 -04:00
9fd41882d2 Herm Op update 2024-04-16 15:39:27 -04:00
ff2ea5de18 Update Tensor_traits.h 2024-04-11 14:25:45 -04:00
5147a42818 Updated hdcg 2024-04-05 01:05:57 -04:00
57552d8ca3 Assign from non-lattice made accelerator resident 2024-04-05 01:05:12 -04:00
13713b2a76 Much faster little dirac operator calculation 2024-04-05 01:04:40 -04:00
36a14e4ee3 Best setup and introduce an HDCG refine method 2024-04-05 01:03:33 -04:00
b4cc788b8c First version used in mrhsHDCG
Need to consolidate files.
Plan: Make this version able to go virtual base, then absorb chulwoos
version when it is proven
2024-04-05 01:02:21 -04:00
0f0e7512f3 Keep MRHS in a different file 2024-04-05 00:59:53 -04:00
1196b1a161 Less verbose 2024-04-05 00:58:58 -04:00
2c8c3be9ee Adef2Mrhs 2024-04-05 00:57:13 -04:00
5b79d51c22 Improvements 2024-04-01 14:18:40 -04:00
da890dc293 Verbose changes 2024-04-01 14:18:00 -04:00
93d0a1e73a HISQ view call 2024-04-01 14:16:47 -04:00
f0a8c7d045 Playing with chebyshevs 2024-04-01 14:16:11 -04:00
db8793777c Logging/verbose 2024-04-01 14:15:41 -04:00
c745484e65 9.5x speed up version 2024-04-01 14:14:30 -04:00
da59379612 Large reg file for double 2024-03-26 17:03:20 +00:00
3ef2a41518 ifdef guard ommitted 2024-03-26 14:50:32 +00:00
aa96f420c6 Acclerator ware MPI guard on the Unix domain sockets 2024-03-26 14:41:25 +00:00
49e9e4ed0e Fences 2024-03-26 14:14:06 +00:00
f7b8163016 Deterministic MPI reduce options 2024-03-26 14:11:40 +00:00
93769eacd3 Updated configure for bounce through host 2024-03-26 14:10:24 +00:00
59b0cc11df REduce the time in single 2024-03-26 00:42:40 +00:00
f32c275376 Updated config options for MPI not being aware of GPU 2024-03-26 00:42:00 +00:00
5404fc66ab Merge needs a fence on SYCL 2024-03-26 00:38:41 +00:00
1f53458af8 Options to bounce through a host buffer if
--disable-accelerator-aware-mpi
2024-03-26 00:37:19 +00:00
434c3e7f1d We have a choice of GET or PUT across NVlink 2024-03-25 14:32:44 +00:00
500b119f3d Deterministic MPI 2024-03-22 15:55:23 +00:00
4b87259c1b New config command for sunspot 2024-03-22 15:43:49 +00:00
503dec34ef This appears working now on Sunspot 2024-03-22 15:43:30 +00:00
d1e9fe50d2 Xor csum for repro testing 2024-03-22 15:42:57 +00:00
d01e5fa838 Improved FlightRecorder 2024-03-22 15:42:32 +00:00
a477c25e8c Sunspot repro tests 2024-03-22 15:42:11 +00:00
1bd20cd9e8 FlightRecorder 2024-03-22 15:40:01 +00:00
e49e95b037 Upgrade of the Britney test with flight recorder and fast xor checksum 2024-03-22 15:39:27 +00:00
6f59fed563 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:32 +00:00
60b7f6c99d Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:26 +00:00
b92dfcc8d3 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:27 +00:00
f6fd6dd053 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:01 +00:00
79ad567dd5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-19 15:43:42 +00:00
fab1efb48c More britney logging improvements 2024-03-19 14:36:21 +00:00
660eb76d93 FFTW from OneAPI 2024-03-19 14:28:33 +00:00
461cd045c6 sliceSum cleanup 2024-03-13 18:18:44 -04:00
fee65d7a75 Merge branch 'paboyle:develop' into sycl_slicesum_update 2024-03-13 18:06:17 -04:00
31f9971dbf avoid PI_ERROR_OUT_OF_RESOURCES in sycl sliceSum 2024-03-13 13:39:26 -04:00
62e7bf024a Updated flight logging for Britney test 2024-03-12 20:10:04 +00:00
95f3d69cf9 Extra hardware test hook 2024-03-12 20:09:37 +00:00
89c0519f83 Repro test 2024-03-12 16:11:33 +00:00
2704b82084 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-12 15:16:24 +00:00
cf8632bbac Britney test option 2024-03-12 15:15:35 +00:00
d224297972 PBS scripts 2024-03-12 15:15:16 +00:00
a4d11a630f Merge pull request #458 from paboyle/fix/HOST_NAME_MAX
fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined
2024-03-07 07:50:25 -05:00
2b4399f8b1 more HOST_NAME_MAX fix 2024-03-07 15:26:01 +09:00
f17b8de907 fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined 2024-03-07 15:22:08 +09:00
d87296f3e8 Merge branch 'develop' of https://github.com/dbollweg/Grid into develop 2024-03-06 16:54:22 -05:00
be94cf1c6f Fewer wait-calls in sycl slicesum 2024-03-06 16:53:13 -05:00
cc04dc42dc Merge branch 'develop' into feature/scidac-wp1 2024-03-06 14:55:21 -05:00
7e5bd46dd3 Booster update 2024-03-06 19:03:45 +01:00
228bbb9d81 Benchmark results 2024-03-06 19:03:35 +01:00
b812a7b4c6 Staggered launch script 2024-03-06 01:32:40 +00:00
891a366f73 Repro CG script 2024-03-06 01:22:55 +00:00
10116b3be8 Force device copyable and tell SYCL to shut it. 2024-03-06 01:13:27 +00:00
a46a0f0882 force device copyable and don't take crap from SYCL 2024-03-06 01:12:49 +00:00
a26a8a38f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-06 00:05:00 +00:00
7435315d50 More blasted shell variables 2024-03-06 00:03:59 +00:00
9b5f741e85 Reproducing CG can be more useful now 2024-03-06 00:03:16 +00:00
517822fdd2 SPR HBM benchmarking right and also PVC batched GEMM 2024-03-06 00:02:27 +00:00
1b93a9be88 Print out the hostname 2024-03-06 00:01:58 +00:00
783a66b348 Deterministic reduction please 2024-03-06 00:01:37 +00:00
976c3e9b59 Hack for flight logging CG inner products.
Can be made to work, but could put in some more serious infrastructure
for repro testing and blame attribution (Britney test) if necessary
2024-03-05 23:59:57 +00:00
f8ca971dae Use of a bare PRECISION macro is not namespace safe and collides with
SYCL
2024-03-05 23:59:13 +00:00
21bc8c24df OneMKL batched blas starting 2024-03-05 23:58:20 +00:00
30228214f7 SYCL conflict with Eigen 2024-03-05 23:56:10 +00:00
2ae980ae43 Update sourceme.sh 2024-03-05 13:39:18 -05:00
6153dec2e4 Update setup.sh 2024-03-05 13:38:32 -05:00
c805f86343 USQCD benchmark 2024-03-01 00:05:04 -05:00
04ca065281 Only one rank opens 2024-02-29 20:09:11 -05:00
88d8fa43d7 Benchmark development 2024-02-29 20:01:44 -05:00
3c49762875 Propagate in the blas routine 2024-02-29 15:33:06 -05:00
436bf1d9d3 Merge pull request #455 from clarkedavida/hisq_fat_links
Hisq fat links
2024-02-29 15:29:39 -05:00
f70df6e195 changed NO_SHIFT and BACKWARD_CONST from define to enum 2024-02-29 12:29:30 -07:00
fce3852dff Merge pull request #451 from paboyle/feature/eigen-3.4.0-update
updating Eigen to 3.4.0
2024-02-28 18:03:37 -05:00
ee1b8bbdbd Merge pull request #454 from edbennett/adjoint-broke
fix HMC for non-fundamental representations
2024-02-28 14:05:27 -05:00
3f1636637d Merge pull request #453 from dbollweg/feature/sliceSum_gpu
Feature/slice sum gpu
2024-02-28 14:04:43 -05:00
2e570f5300 Merge pull request #457 from lehner/feature/gpt
Import GPT-related updates
2024-02-28 13:59:04 -05:00
9f89486df5 remove unnecessary code path 2024-02-28 19:56:23 +01:00
22b43b86cb Make GPT test suite work with SYCL 2024-02-28 12:57:17 +01:00
3c9012676a CUDA cub refuses to reduce vSpinColourMatrix, breaking up into smaller parts like already done for HIP case. 2024-02-27 12:41:45 -05:00
b507fe209c Added SpinColourMatrix case to sliceSum Test 2024-02-27 11:28:32 -05:00
6cd2d8fcd5 Replace cuda/hip memcpy with Grid functions 2024-02-26 09:55:07 -05:00
b02d022993 fixed race condition (thx michael) 2024-02-23 17:14:28 -07:00
94581e3c7a accelerator_for is broken 2024-02-23 15:58:33 -07:00
88b52cc045 Merge branch 'develop' into hisq_fat_links 2024-02-23 14:47:15 -07:00
0a816b5509 Merge branch 'feature/sliceSum_gpu' of https://github.com/dbollweg/Grid into feature/sliceSum_gpu 2024-02-22 21:43:06 -05:00
1c8b807c2e free malloc'd memory 2024-02-22 21:42:44 -05:00
66391f84f2 Merge branch 'feature/gpt' of ../Grid into develop 2024-02-21 19:05:00 +01:00
97f7a9ecb3 fix HMC for non-fundamental representations 2024-02-21 08:27:55 +00:00
15878f7613 sliceSumReduction_cub_large now also faster than CPU on Frontier 2024-02-16 13:55:21 -05:00
e0d5e3c6c7 Merge branch 'paboyle:develop' into feature/sliceSum_gpu 2024-02-16 13:16:37 -05:00
6f3455900e Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs 2024-02-16 13:15:02 -05:00
56827d6ad6 accelerator_inline bug 2024-02-14 13:56:57 -07:00
73c0b29535 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-02-13 20:19:32 +00:00
303b83cdb8 Scaling benchmarks, verbosity and MPICH aware in acceleratorInit()
For some reason Dirichlet benchmark fails on several nodes; need to
debug this.
2024-02-13 19:48:03 +00:00
5ef4da3f29 Silence verbose 2024-02-13 19:47:36 +00:00
1502860004 Benchmark scripts 2024-02-13 19:47:02 +00:00
585efc6f3f More benchmark scripts 2024-02-13 19:40:49 +00:00
62055e04dd missing semicolon generates error with some compilers 2024-02-13 18:18:27 +01:00
e4a641b64e removing old Eigen tensor patch 2024-02-13 10:37:14 +01:00
8849f187f1 updating Eigen to 3.4.0 2024-02-13 10:30:22 +01:00
db420525b3 fix Simd::Nsimd typo 2024-02-12 15:03:53 -07:00
b5659d106e more test cases 2024-02-09 13:37:14 -05:00
4b43307402 Undo include path changes for level zero api header 2024-02-09 13:07:56 -05:00
09af8c25a2 Merge branch 'paboyle:develop' into feature/sliceSum_gpu 2024-02-09 13:02:59 -05:00
9514035b87 refactor slicesum: slicesum uses GPU version by default now 2024-02-09 13:02:28 -05:00
2da09ae99b acceleration compiles and doesn't break scalar mode 2024-02-06 18:40:13 -07:00
a38fb0e04a first effort toward accelerators 2024-02-06 18:24:55 -07:00
7019916294 RNG seed change safer for large volumes; this is a long term solution 2024-02-07 00:56:39 +00:00
1514b4f137 slicesum_sycl passes test 2024-02-06 19:08:44 -05:00
91cf5ee312 Updated bench script 2024-02-06 23:45:10 +00:00
0a6e2f42c5 small amount of cleanup 2024-02-06 16:32:07 -07:00
ab2de131bd work towards sliceSum for sycl backend 2024-02-06 13:24:45 -05:00
5bfa88be85 Aurora MPI standalone benchmake and options that work well 2024-02-06 16:28:40 +00:00
5af8da76d7 Fix cuda compilation of Lattice_slicesum_gpu.h 2024-02-01 18:02:30 -05:00
b8b9dc952d Async memcpy's and cleanup 2024-02-01 17:55:35 -05:00
79a6ed32d8 Use accelerator_for2d and DeviceSegmentedRecude to avoid kernel launch latencies 2024-02-01 16:41:03 -05:00
caa5f97723 Add sliceSum gpu using cub/hipcub 2024-01-31 16:50:06 -05:00
4924b3209e projectU3 yields a unitary matrix 2024-01-23 14:43:58 -07:00
00f24f8765 already found some bugs in projection, still needs testing 2024-01-22 05:50:16 -07:00
f5b3d582b0 first attempt at U3 projection 2024-01-22 02:49:40 -07:00
981c93d67a update Test_fatLinks to accept Naik 2024-01-21 21:09:19 -07:00
c020b78e02 Merge branch 'develop' into hisq_fat_links 2024-01-21 20:21:08 -07:00
2a0d75bac2 Aurora files 2023-12-21 23:20:17 +00:00
f48298ad4e Bug fix 2023-12-11 20:57:02 -05:00
645e47c1ba Config for Ampere Altra ARM 2023-12-08 16:17:56 -05:00
d1d9827263 Integrator logging update 2023-12-08 12:14:00 -05:00
14643c0aab SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512) 2023-12-04 15:45:57 -05:00
b77a9b8947 SDDC compiles starting 2023-11-30 14:31:51 -05:00
7d077fe493 Frontier compiel 2023-11-09 13:58:44 -05:00
9cd4128833 fix naik bug 2023-11-03 14:11:38 -06:00
c8b17c9526 Naik to CShift 2023-11-02 12:43:22 -06:00
2ae2a81e85 attempt to fix Naik 2023-10-31 13:54:55 -06:00
69c869d345 fixed stupid typo 2023-10-30 17:41:52 -06:00
df9b958c40 naik now returns separately 2023-10-30 17:40:53 -06:00
3d3376d1a3 LePage works, trying Naik 2023-10-27 16:26:31 -06:00
f2648e94b9 getHostPointer added to Lattice 2023-10-23 13:47:41 +02:00
21ed6ac0f4 added floating-point support 2023-10-20 13:54:26 -06:00
7bb8ab7000 improve smearing templating 2023-10-20 08:41:02 -06:00
2c824c2641 Merge branch 'develop' into hisq_fat_links 2023-10-17 16:03:59 -06:00
391fd9cc6a try lepage term 2023-10-17 14:57:15 -06:00
51051df62c 3GeV run setup 2023-10-16 20:49:52 +03:00
33097681b9 FTHMC compiled and merged to develop 2023-10-14 00:42:55 +03:00
07e4900218 FTHMC commit 2023-10-13 18:21:57 +03:00
36ab567d67 FTHMC 3 Gev 2023-10-13 18:21:57 +03:00
e19171523b FTHMC Status at lattice conference commit 2023-10-13 18:21:56 +03:00
9626a2c7c0 Asynch handling 2023-10-13 18:21:56 +03:00
e936f5b80b IfGridTensor shorthand 2023-10-13 18:21:56 +03:00
ffc0639cb9 Running in HMC tests 2023-10-13 18:21:56 +03:00
c5b43b322c traceProduct eliminates non-contributing intermediate terms 2023-10-13 18:21:56 +03:00
c9c4576237 Improved frontier cshift 2023-10-13 18:21:56 +03:00
bf4369f72d clean up HISQSmear with decltypes 2023-10-12 12:41:06 -06:00
36600899e2 working 7-link; Grid_log; generalShift 2023-10-12 11:11:39 -06:00
b9c70d156b Merge branch 'develop' into hisq_fat_links 2023-10-10 22:44:17 -06:00
eb89579fe7 Merge remote-tracking branch 'origin/develop' into develop 2023-10-10 22:43:51 -06:00
0cfd13d18b 7-link working 2023-10-10 22:41:52 -06:00
e6ed516052 merged 2023-10-08 09:00:37 +02:00
e2a3dae1f2 Option for multiple simultaneous CartesianStencils 2023-10-08 08:58:44 +02:00
6d0c2de399 Deprecate teh PVC directory and make a PVC-OEM generic PVC target with
no queueing system dependency -- just interactive scripts
2023-10-03 17:04:20 +00:00
7786ea9921 Bug fix in script 2023-10-03 09:58:44 -07:00
d93eac7b1c Performance regressed and is OK in icpx 2023.2 2023-10-03 15:53:14 +00:00
63d9b8e8a3 Merge remote-tracking branch 'origin/develop' into hisq_fat_links 2023-09-16 23:20:31 -06:00
d247031c98 try 7-link 2023-09-16 23:18:16 -06:00
e29b97b3ea Qslash term added 2023-09-14 16:14:03 -04:00
ad2b699d2b Better macos 2023-09-14 16:12:21 -04:00
affff3865f Merge branch 'develop' into hisq_fat_links 2023-08-11 23:08:04 -06:00
9c22655b5a Merge remote-tracking branch 'origin/develop' into develop 2023-08-11 23:06:42 -06:00
99d879ea7f 5-link first attempt 2023-08-11 22:56:30 -06:00
9d263d9a7d fix bug in HISQSmearing; move benchmark b/c i don't understand how makefiles work 2023-06-28 10:05:34 -06:00
9015c229dc add benchmark to see whether matrix multiplication is slower than read from object 2023-06-27 21:28:26 -06:00
a7eabaad56 rudimentary appendShift convenience method, which allows the user to append an arbitrary shift in one line 2023-06-26 23:59:28 -06:00
eeb4703b84 develop wrappers to make the stencils easier to construct 2023-06-26 17:45:35 -06:00
a07421b3d3 Merge branch 'develop' into hisq_fat_links 2023-06-26 13:51:32 -06:00
cda53b4068 Merge remote-tracking branch 'origin/develop' into develop 2023-06-26 13:51:06 -06:00
df99f227c1 include missing staple orientations; invert path direction, which was backwards 2023-06-22 14:57:10 -06:00
d536c67b9d add HISQSmearing to Smearing.h 2023-06-20 16:04:48 -06:00
f44f005dad rename _lvl1 --> _linkTreatment 2023-06-20 15:48:27 -06:00
26b2caf570 add template parameter to Smear_HISQ_fat for MILC interfacing 2023-06-20 15:37:54 -06:00
8bb078db25 Merge branch 'develop' into hisq_fat_links 2023-06-20 13:05:00 -06:00
b61ba40023 Merge remote-tracking branch 'origin/develop' into develop 2023-06-20 13:04:53 -06:00
452bf2e907 Accelerator basisRotate also on HIP 2023-06-20 20:36:24 +03:00
14d352ea4f added smearParams struct 2023-06-12 16:55:44 -06:00
1cf9ec1cce now compiles 2023-06-09 16:27:45 -06:00
4b994a1bc7 trouble with compilation 2023-06-08 17:37:25 -06:00
e506d6d369 Merge branch 'develop' into hisq_fat_links 2023-06-07 21:16:20 -06:00
ab56ad8d7a fix 3-link stencil 2023-06-07 21:14:58 -06:00
e8c29e2fe5 Merge pull request #31 from paboyle/develop
Sync
2023-05-28 16:13:12 +02:00
3825329f8e Merge branch 'develop' into hisq_fat_links 2023-05-24 15:37:25 -06:00
c7bdf2c0e4 3-link test at least gives an answer 2023-05-21 04:33:20 -06:00
da9cbfc7cc Suppress BuildSurfaceList verbosity in Stencil.h 2023-05-19 20:22:20 +02:00
6b9f07c1ed Merge pull request #30 from paboyle/develop
Merge upstream
2023-05-19 20:20:58 +02:00
bf91778550 verbose plaquette example; fat link test frame 2023-05-17 15:15:54 -06:00
5f75735dab Add M and Mdag to WilsonTMFermion 2023-04-06 18:25:05 +02:00
338 changed files with 24209 additions and 5310 deletions

4
.gitignore vendored
View File

@ -1,3 +1,7 @@
# Doxygen stuff
html/*
latex/*
# Compiled Object files #
#########################
*.slo

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@ -0,0 +1,5 @@
CXX=hipcc
MPICXX=mpicxx
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench

View File

@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@ -34,7 +34,7 @@
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif

View File

@ -30,9 +30,14 @@ directory
#include <type_traits>
#include <cassert>
#include <exception>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
#define EXCEPTION_CHECK_BEGIN(A) try {
#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }

View File

@ -50,6 +50,7 @@ NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
@ -74,6 +75,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/multigrid/MultiGrid.h>
NAMESPACE_CHECK(multigrid);

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#ifdef USE_MKL
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>
#else
#include <fftw3.h>
@ -168,6 +168,7 @@ public:
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
assert(0);
#else
conformable(result.Grid(),vgrid);
@ -190,7 +191,8 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -213,6 +215,7 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -226,6 +229,7 @@ public:
}
// Barrel shift and collect global pencil
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@ -247,6 +251,7 @@ public:
}
}
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@ -269,6 +274,7 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@ -285,6 +291,7 @@ public:
}
result = result*div;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif

View File

@ -103,6 +103,38 @@ public:
_Mat.MdagM(in,out);
}
};
template<class Matrix,class Field>
class MMdagLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MMdag(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.MMdag(in,out);
}
};
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother
@ -165,10 +197,10 @@ public:
assert(0);
};
void Op (const Field &in, Field &out){
assert(0);
HermOp(in,out);
}
void AdjOp (const Field &in, Field &out){
assert(0);
HermOp(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
@ -245,6 +277,38 @@ public:
assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several

View File

@ -45,6 +45,11 @@ public:
M(in,tmp);
Mdag(tmp,out);
}
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;

View File

@ -59,7 +59,7 @@ public:
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
delta*=1.02;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
@ -131,6 +131,26 @@ public:
Coeffs[j] = s * 2.0/order;
}
};
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
@ -249,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;

View File

@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
* Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
* type = 1 for the approximation which is infinite at x = 0. */
zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
l, invlambda, xi, xisq, *tv, s, opl;
int m, czero, ts;
@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to PRECISION for external use only */
/* Converting everything to ZOLO_PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
}
zolotarev_data* higham(PRECISION epsilon, int n) {
zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
int m, czero;
zolotarev_data *zd;
@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
#ifdef TEST
#undef ZERO
#define ZERO ((PRECISION) 0)
#define ZERO ((ZOLO_PRECISION) 0)
#undef ONE
#define ONE ((PRECISION) 1)
#define ONE ((ZOLO_PRECISION) 1)
#undef TWO
#define TWO ((PRECISION) 2)
#define TWO ((ZOLO_PRECISION) 2)
/* Evaluate the rational approximation R(x) using the factored form */
static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R;
ZOLO_PRECISION R;
if (rdata -> type == 0) {
R = rdata -> A * x;
@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
/* Evaluate the rational approximation R(x) using the partial fraction form */
static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> alpha[rdata -> da - 1];
ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
for (m = 0; m < rdata -> dd; m++)
R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
* non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
* but with signalling overflow you will get an error message. */
static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> beta[0] * x;
ZOLO_PRECISION R = rdata -> beta[0] * x;
for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
return R;
}
/* Evaluate the rational approximation R(x) using Cayley form */
static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION T;
ZOLO_PRECISION T;
T = rdata -> type == 0 ? ONE : -ONE;
for (m = 0; m < rdata -> n; m++)
@ -607,7 +607,7 @@ int main(int argc, char** argv) {
int m, n, plotpts = 5000, type = 0;
float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
zolotarev_data *rdata;
PRECISION y;
ZOLO_PRECISION y;
FILE *plot_function, *plot_error,
*plot_partfrac, *plot_contfrac, *plot_cayley;
@ -626,13 +626,13 @@ int main(int argc, char** argv) {
}
rdata = type == 2
? higham((PRECISION) eps, n)
: zolotarev((PRECISION) eps, n, type);
? higham((ZOLO_PRECISION) eps, n)
: zolotarev((ZOLO_PRECISION) eps, n, type);
printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
"\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
"\tPRECISION = " STRINGIFY(PRECISION)
"\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
"\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
"\tDelta = %g (maximum error)\n\n"
"\tA = %g (overall factor)\n",
@ -681,15 +681,15 @@ int main(int argc, char** argv) {
x = 2.4 * (float) m / plotpts - 1.2;
if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
/* skip x = 0 for type 1, as R(0) is singular */
y = zolotarev_eval((PRECISION) x, rdata);
y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
fprintf(plot_function, "%g %g\n", x, (float) y);
fprintf(plot_error, "%g %g\n",
x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
maxypferr = MAX(maxypferr, fabs(ypferr));

View File

@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION
#define PRECISION double
#ifndef ZOLO_PRECISION
#define ZOLO_PRECISION double
#endif
#define ZPRECISION PRECISION
#define ZPRECISION ZOLO_PRECISION
#define ZOLOTAREV_DATA zolotarev_data
#endif
@ -77,8 +77,8 @@ typedef struct {
* zolotarev_data structure. The arguments must satisfy the constraints that
* epsilon > 0, n > 0, and type = 0 or 1. */
ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
void zolotarev_free(zolotarev_data *zdata);
#endif
@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_BEGIN(Grid);
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
NAMESPACE_END(Grid);

View File

@ -31,10 +31,16 @@ Author: Peter Boyle <pboyle@bnl.gov>
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
#include <hipblas/hipblas.h>
#include <cublas_v2.h>
#endif
#ifdef GRID_SYCL
#error // need oneMKL version
#include <oneapi/mkl.hpp>
#endif
#if 0
#define GRID_ONE_MKL
#endif
#ifdef GRID_ONE_MKL
#include <oneapi/mkl.hpp>
#endif
///////////////////////////////////////////////////////////////////////
@ -46,12 +52,15 @@ NAMESPACE_BEGIN(Grid);
typedef hipblasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_CUDA
typedef cudablasHandle_t gridblasHandle_t;
typedef cublasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_SYCL
typedef int32_t gridblasHandle_t;
typedef sycl::queue *gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
#ifdef GRID_ONE_MKL
typedef sycl::queue *gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t;
#endif
@ -70,12 +79,20 @@ public:
#ifdef GRID_CUDA
std::cout << "cublasCreate"<<std::endl;
cublasCreate(&gridblasHandle);
cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
#endif
#ifdef GRID_HIP
std::cout << "hipblasCreate"<<std::endl;
hipblasCreate(&gridblasHandle);
#endif
#ifdef GRID_SYCL
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
sycl::gpu_selector selector;
sycl::device selectedDevice { selector };
sycl::property_list q_prop{sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif
gridblasInit=1;
}
@ -110,6 +127,9 @@ public:
#endif
#ifdef GRID_SYCL
accelerator_barrier();
#endif
#ifdef GRID_ONE_MKL
gridblasHandle->wait();
#endif
}
@ -188,6 +208,9 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -247,23 +270,169 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k, ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexD *) &alpha_p[0],
(const ComplexD **)&Amk[0], (const int64_t *)&lda64,
(const ComplexD **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexD *) &beta_p[0],
(ComplexD **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#if 0
// This code was used to check the mat mul on Sunspot/OneMKL
std::cerr << " Called SYCL batched ZGEMM OpA "<< OpA << " OpB "<<OpB <<std::endl;
std::vector<ComplexD> A(m*k); // pointer list to matrices
std::vector<ComplexD> B(k*n);
std::vector<ComplexD> C(m*n);
// int sda = lda*k;
// int sdb = ldb*k;
// int sdc = ldc*n;
std::cerr << " Checking the GEMM results "<<std::endl;
for (int p = 0; p < 1; ++p) {
ComplexD * Amk_p; // pointer list to matrices
ComplexD * Bkn_p; // pointer list to matrices
ComplexD * Cmn_p; // pointer list to matrices
acceleratorCopyFromDevice((void *)&Amk[p],(void *)&Amk_p,sizeof(ComplexD*));
acceleratorCopyFromDevice((void *)&Bkn[p],(void *)&Bkn_p,sizeof(ComplexD*));
acceleratorCopyFromDevice((void *)&Cmn[p],(void *)&Cmn_p,sizeof(ComplexD*));
std::cerr << " p " << p << " copied pointers "<<std::endl;
acceleratorCopyFromDevice((void *)Amk_p,(void *)&A[0],m*k*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)Bkn_p,(void *)&B[0],k*n*sizeof(ComplexD));
acceleratorCopyFromDevice((void *)Cmn_p,(void *)&C[0],m*n*sizeof(ComplexD));
std::cerr << " p " << p << " copied matrices "<<std::endl;
std::cerr << " C[0] "<<C[0]<<std::endl;
std::cerr << " A[0] "<<A[0]<<std::endl;
std::cerr << " B[0] "<<B[0]<<std::endl;
std::cerr << " m "<<m<<std::endl;
std::cerr << " n "<<n<<std::endl;
std::cerr << " k "<<k<<std::endl;
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk) {
int idx_a, idx_b;
// int lda = m; // m x k column major
// int ldb = k; // k x n column major
// int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N) {
idx_a =kk + mm*lda;
} else {
idx_a =mm + kk*lda;
}
if(OpB!=GridBLAS_OP_N) {
idx_b =nn + kk*ldb;
} else {
idx_b =kk + nn*ldb;
}
// std::cerr << " idx_a "<<idx_a<<" idx_b "<<idx_b<<std::endl;
ComplexD Ac = A[idx_a];
ComplexD Bc = B[idx_b];
if(OpA==GridBLAS_OP_C) Ac = conjugate(Ac);
if(OpB==GridBLAS_OP_C) Bc = conjugate(Bc);
c_mn += Ac*Bc;
}
std::cerr << " beta "<<beta<<" alpha "<<alpha<<" C_"<<mm<<","<<nn<<" "<<c_mn<<" "<<C[mm + nn*ldc]<<std::endl;
}
}
}
}
#endif
// synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
@ -284,6 +453,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -344,21 +516,111 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k, ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
}
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
@ -381,6 +643,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -440,21 +705,81 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(float *) &alpha_p[0],
(const float **)&Amk[0], (const int64_t *)&lda64,
(const float **)&Bkn[0], (const int64_t *)&ldb64,
(float *) &beta_p[0],
(float **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k, ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
}
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
@ -465,7 +790,6 @@ public:
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
@ -478,6 +802,9 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
@ -538,148 +865,136 @@ public:
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
/*
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
onemkl::transpose::N,
onemkl::transpose::N,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
1,&batchCount64);
*/
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(const double **)&Amk[0], (const int64_t *)&lda64,
(const double **)&Bkn[0], (const int64_t *)&ldb64,
(double *) &beta_p[0],
(double **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k, ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
}
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Strided case used by benchmark, but generally unused in Grid
// Keep a code example in double complex, but don't generate the single and real variants for now
////////////////////////////////////////////////////////////////////////////////////////////////
void gemmStridedBatched(int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // pointer list to matrices
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn,
int batchCount)
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
// Use C-row major storage, so transpose calls
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
int sda = m*k;
int sdb = k*n;
int sdc = m*n;
deviceVector<ComplexD> alpha_p(1);
deviceVector<ComplexD> beta_p(1);
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
#ifdef GRID_HIP
auto err = hipblasZgemmStridedBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda, sda,
(hipblasDoubleComplex *) Bkn, ldb, sdb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc, sdc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasZgemmStridedBatched(gridblasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda, sda,
(cuDoubleComplex *) Bkn, ldb, sdb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc, sdc,
batchCount);
#endif
#ifdef GRID_SYCL
#warning "oneMKL implementation not made "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k, ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc];
}
}
}
#endif
}
void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil)
{
int32_t N_A = nbasis*nbasis*coarseVol*nstencil;
int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time
int32_t N_C = nbasis*nrhs*coarseVol*nstencil;
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
ComplexD alpha(1.0);
ComplexD beta (1.0);
for(int i=0;i<10;i++){
RealD t0 = usecond();
for(int s=0;s<nstencil;s++){
gemmStridedBatched(nbasis,nrhs,nbasis,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0], // m x n
coarseVol);
}
synchronise();
RealD t1 = usecond();
RealD flops = 8.0*nbasis*nbasis*nrhs*coarseVol*nstencil;
RealD bytes = 1.0*sizeof(ComplexD)*(nbasis*nbasis+nbasis*nrhs*3)*coarseVol*nstencil;
std::cout << " batched Blas call "<<i<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
std::cout << " batched Blas call "<<i<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
int32_t N_A = M*K*BATCH;
int32_t N_B = K*N*BATCH;
int32_t N_C = M*N*BATCH;
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
CComplex alpha(1.0);
CComplex beta (1.0);
RealD flops = 8.0*M*N*K*BATCH;
int ncall=1000;
deviceVector<CComplex *> As(BATCH);
deviceVector<CComplex *> Bs(BATCH);
deviceVector<CComplex *> Cs(BATCH);
for(int b = 0 ; b < BATCH;b++) {
CComplex *ptr;
ptr = &A[b*M*K]; acceleratorPut(As[b],ptr);
ptr = &B[b*K*N]; acceleratorPut(Bs[b],ptr);
ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr);
}
// Warm up call
gemmBatched(M,N,K,
alpha,
As, // m x k
Bs, // k x n
beta,
Cs);
synchronise();
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemmBatched(M,N,K,
alpha,
As, // m x k
Bs, // k x n
beta,
Cs);
synchronise();
}
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K)*BATCH;
flops = 8.0*M*N*K*BATCH*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
assert(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);

View File

@ -129,7 +129,7 @@ public:
{
int nvec = vecs.size();
typedef typename Field::vector_object vobj;
std::cout << " BlockProjector importing "<<nvec<< " vectors" <<std::endl;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
assert(vecs[0].Grid()==fine_grid);
@ -297,12 +297,12 @@ public:
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
std::cout << " BlockProjector importing coarse grid "<<nvec<< " vectors" <<std::endl;
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
@ -351,7 +351,7 @@ public:
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
std::cout << " BlockProjector importing coarse grid "<<nvec<< " vectors" <<std::endl;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid);
@ -408,6 +408,7 @@ public:
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
assert(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
@ -446,10 +447,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw,
ComplexD(1.0),
scalar(1.0),
Vd,
Fd,
ComplexD(0.0), // wipe out C
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl;
@ -496,10 +497,10 @@ public:
int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nbasis,
ComplexD(1.0),
scalar(1.0),
Vd,
Cd,
ComplexD(0.0), // wipe out C
scalar(0.0), // wipe out C
Fd);
BLAS.synchronise();
// std::cout << " blas call done"<<std::endl;

View File

@ -97,8 +97,8 @@ public:
}
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{
// std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
assert(ev<eval.size());
std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
eval[ev] = _eval;
int64_t offset = ev*vol*words;
@ -132,7 +132,6 @@ public:
int64_t vw = vol * words;
std::cout << GridLogMessage << "MultiRHSDelation for "<<nrhs<<" sources with "<<nev<<" eigenvectors "<<std::endl;
RealD t0 = usecond();
BLAS_R.resize(nrhs * vw); // cost free if size doesn't change
BLAS_G.resize(nrhs * vw); // cost free if size doesn't change
@ -183,10 +182,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw,
ComplexD(1.0),
scalar(1.0),
Ed,
Rd,
ComplexD(0.0), // wipe out C
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
@ -211,10 +210,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev,
ComplexD(1.0),
scalar(1.0),
Ed, // x . nev
Cd, // nev . nrhs
ComplexD(0.0),
scalar(0.0),
Gd);
BLAS.synchronise();
@ -227,7 +226,7 @@ public:
acceleratorCopyDeviceToDevice(&BLAS_G[offset],&v[0],sizeof(scalar_object)*vol);
}
RealD t1 = usecond();
std::cout << GridLogMessage << "MultiRHSDelation for "<<nrhs<<" sources with "<<nev<<" eigenvectors took " << (t1-t0)/1e3 <<" ms"<<std::endl;
std::cout << GridLogMessage << "MultiRHSDeflation for "<<nrhs<<" sources with "<<nev<<" eigenvectors took " << (t1-t0)/1e3 <<" ms"<<std::endl;
}
};

View File

@ -538,139 +538,6 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2mrhs : public TwoLevelADEF2<Field,CoarseField,Aggregation>
{
public:
GridBase *coarsegridmrhs;
LinearFunction<CoarseField> &_CoarseSolverMrhs;
LinearFunction<CoarseField> &_CoarseSolverPreciseMrhs;
LinearFunction<CoarseField> &_CoarseGuesser;
TwoLevelADEF2mrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
// LinearFunction<CoarseField> &CoarseSolver,
// LinearFunction<CoarseField> &CoarseSolverPrecise,
LinearFunction<CoarseField> &CoarseSolverMrhs,
LinearFunction<CoarseField> &CoarseSolverPreciseMrhs,
LinearFunction<CoarseField> &CoarseGuesser,
GridBase *rhsgrid,
Aggregation &Aggregates) :
TwoLevelADEF2<Field,CoarseField,Aggregation>(tol, maxit,FineLinop,Smoother,CoarseSolverMrhs,CoarseSolverPreciseMrhs,Aggregates),
_CoarseSolverMrhs(CoarseSolverMrhs),
_CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs),
_CoarseGuesser(CoarseGuesser)
{
coarsegridmrhs = rhsgrid;
};
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
int nrhs=x.size();
std::cout << GridLogMessage<<"HDCG: fPcg Vstart for "<<nrhs<<" right hand sides" <<std::endl;
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
// = [1 - Ass_inv A] Guess + Assinv src
// = P^T guess + Assinv src
// = Vstart [Tang notation]
// This gives:
// W^T (src - A x_0) = src_s - A guess_s - r_s
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
std::cout << GridLogMessage<<"HDCG: fPcg Vstart Mrhs projecting "<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++) {
this->_Aggregates.ProjectToSubspace(PleftProj,src[rhs]); // can optimise later
InsertSliceFast(PleftProj,PleftProjMrhs,rhs,0);
this->_CoarseGuesser(PleftProj,PleftMss_proj);
InsertSliceFast(PleftMss_proj,PleftMss_projMrhs,rhs,0);
}
std::cout << GridLogMessage<<"HDCG: fPcg Vstart Mrhs coarse solve "<<std::endl;
this->_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s
std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj,PleftMss_projMrhs,rhs,0);
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x[rhs]);
}
}
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
int nrhs=in.size();
std::cout << " mrhs PcgM1 for "<<nrhs<<" right hand sides"<<std::endl;
MemoryManager::Print();
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(this->grid);
std::vector<Field> Min(nrhs,this->grid);
std::cout << " mrhs PcgM1 Min "<<std::endl;
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
std::cout << " mrhs Coarse ops "<<std::endl;
// Really want the coarse solver
// to do the guessing itself, knowing the eigenvectors.
// The projection to coarse space is in aggregates
// If the Aggregates have a layout change option
// they could formulate as a BLAS routine.
// Put the routines in this object
for(int rhs=0;rhs<nrhs;rhs++) {
std::cout << GridLogMessage<<" Smoother for "<<rhs<<std::endl;
this->_Smoother(in[rhs],Min[rhs]);
std::cout << GridLogMessage<<" HermOp for "<<rhs<<std::endl;
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
axpy(tmp,-1.0,out[rhs],in[rhs]); // tmp = in - A Min
// Was
// this->_Aggregates.ProjectToSubspace(PleftProj,tmp); // can optimise later
// Now:
std::cout << GridLogMessage<<" blockProject for "<<rhs<<std::endl;
blockProjectFast(PleftProj,tmp,this->_Aggregates.subspace);
std::cout << GridLogMessage<<" InsertSlice for "<<rhs<<std::endl;
InsertSlice(PleftProj,PleftProjMrhs,rhs,0);
std::cout << GridLogMessage<<" CoarseGuesser for "<<rhs<<std::endl;
this->_CoarseGuesser(PleftProj,PleftMss_proj);
std::cout << GridLogMessage<<" InsertSlice for "<<rhs<<std::endl;
InsertSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
}
MemoryManager::Print();
std::cout << " Coarse solve "<<std::endl;
this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
std::cout << " Coarse solve done"<<std::endl;
MemoryManager::Print();
for(int rhs=0;rhs<nrhs;rhs++) {
std::cout << GridLogMessage<<" Extract for "<<rhs<<std::endl;
ExtractSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
std::cout << GridLogMessage<<" Promote for "<<rhs<<std::endl;
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
// std::cout << " add for "<<rhs<<std::endl;
axpy(out[rhs],1.0,Min[rhs],tmp); // Min+tmp
}
MemoryManager::Print();
std::cout << " Extracted "<<std::endl;
}
};
template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field>

View File

@ -0,0 +1,734 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/AdefGeneric.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
/*
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
*/
NAMESPACE_BEGIN(Grid);
template<class Field>
class TwoLevelCGmrhs
{
public:
RealD Tolerance;
Integer MaxIterations;
GridBase *grid;
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer;
GridStopWatch DeflateTimer;
GridStopWatch CoarseTimer;
GridStopWatch FineTimer;
GridStopWatch SmoothTimer;
GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions
TwoLevelCGmrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
Tolerance(tol),
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{
grid = fine;
};
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_zz,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
std::vector<Field> & MQ,
const std::vector<Field> & Z,
const std::vector<Field> & MZ)
{
RealD t0=usecond();
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
RealD t1=usecond();
m_zz = 0.5*(m_zz+m_zz.adjoint());
Eigen::MatrixXcd L = m_zz.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
RealD t3=usecond();
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
RealD t4=usecond();
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
assert(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
// std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
M3Timer.Start();
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
M3Timer.Stop();
a[rhs] = rtz[rhs]/d[rhs];
LinalgTimer.Start();
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
LinalgTimer.Stop();
}
// Compute z = M x (for *all* RHS)
M1Timer.Start();
PcgM1(r,z);
M1Timer.Stop();
RealD max_rn=0.0;
LinalgTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
// std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG:fPcg rhs "<<rhs<<" k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
LinalgTimer.Stop();
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : fine M3 "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
}
public:
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out) = 0;
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src) = 0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
}
virtual RealD PcgM3(const Field & p, Field & mmp){
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
};
template<class Field, class CoarseField>
class TwoLevelADEF2mrhs : public TwoLevelCGmrhs<Field>
{
public:
GridBase *coarsegrid;
GridBase *coarsegridmrhs;
LinearFunction<CoarseField> &_CoarseSolverMrhs;
LinearFunction<CoarseField> &_CoarseSolverPreciseMrhs;
MultiRHSBlockProject<Field> &_Projector;
MultiRHSDeflation<CoarseField> &_Deflator;
TwoLevelADEF2mrhs(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolverMrhs,
LinearFunction<CoarseField> &CoarseSolverPreciseMrhs,
MultiRHSBlockProject<Field> &Projector,
MultiRHSDeflation<CoarseField> &Deflator,
GridBase *_coarsemrhsgrid) :
TwoLevelCGmrhs<Field>(tol, maxit,FineLinop,Smoother,Projector.fine_grid),
_CoarseSolverMrhs(CoarseSolverMrhs),
_CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs),
_Projector(Projector),
_Deflator(Deflator)
{
coarsegrid = Projector.coarse_grid;
coarsegridmrhs = _coarsemrhsgrid;// Thi could be in projector
};
// Override Vstart
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
int nrhs=x.size();
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
// = [1 - Ass_inv A] Guess + Assinv src
// = P^T guess + Assinv src
// = Vstart [Tang notation]
// This gives:
// W^T (src - A x_0) = src_s - A guess_s - r_s
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
this->_Projector.blockProject(src,PleftProj);
this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
for(int rhs=0;rhs<nrhs;rhs++) {
InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
}
this->_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
}
this->_Projector.blockPromote(x,PleftMss_proj);
}
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
int nrhs=in.size();
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
std::vector<Field> tmp(nrhs,this->grid);
std::vector<Field> Min(nrhs,this->grid);
std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
// this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
this->_Smoother(in,Min);
this->SmoothTimer.Stop();
#else
for(int rhs=0;rhs<nrhs;rhs++) {
this->SmoothTimer.Start();
this->_Smoother(in[rhs],Min[rhs]);
this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
this->FineTimer.Start();
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
this->FineTimer.Stop();
}
this->ProjectTimer.Start();
this->_Projector.blockProject(tmp,PleftProj);
this->ProjectTimer.Stop();
this->DeflateTimer.Start();
this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
this->DeflateTimer.Stop();
this->InsertTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++) {
InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
}
this->InsertTimer.Stop();
this->CoarseTimer.Start();
this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
this->CoarseTimer.Stop();
this->InsertTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++) {
ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
}
this->InsertTimer.Stop();
this->PromoteTimer.Start();
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop();
this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
}
// this->zzz=out[0];
this->FineTimer.Stop();
}
};
NAMESPACE_END(Grid);

View File

@ -31,6 +31,58 @@ directory
NAMESPACE_BEGIN(Grid);
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
typedef typename Field::scalar_type scomplex;
int Nblock = X.size();
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
//
//Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
template<class Field>
double normv(const std::vector<Field> &P){
int Nblock = P.size();
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
//////////////////////////////////////////////////////////////////////////
@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
const std::vector<Field> & R)
{
InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
@ -186,6 +256,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q;
@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer;
SolverTimer.Start();
RealD max_resid=0;
int k;
for (k = 1; k <= MaxIterations; k++) {
@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
max_resid=0;
RealD rrsum=0;
RealD rr;
@ -322,7 +398,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
@ -466,43 +544,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
IterationsToComplete = k;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation:
//--------------------------
@ -549,6 +590,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
}
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);

View File

@ -38,12 +38,13 @@ NAMESPACE_BEGIN(Grid);
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
@ -54,11 +55,26 @@ public:
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
ErrorOnNoConverge(err_on_no_conv)
{};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
virtual void LogIteration(int k,RealD a,RealD b){
// std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
};
virtual void LogBegin(void){
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
};
GRID_TRACE("ConjugateGradient");
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
this->LogBegin();
GRID_TRACE("ConjugateGradient");
GridStopWatch PreambleTimer;
GridStopWatch ConstructTimer;
GridStopWatch NormTimer;
GridStopWatch AssignTimer;
PreambleTimer.Start();
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
@ -66,22 +82,32 @@ public:
RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred;
Field p(src);
Field mmp(src);
Field r(src);
// Was doing copies
ConstructTimer.Start();
Field p (src.Grid());
Field mmp(src.Grid());
Field r (src.Grid());
ConstructTimer.Stop();
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
NormTimer.Start();
ssq = norm2(src);
RealD guess = norm2(psi);
NormTimer.Stop();
assert(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) {
r = src;
p = r;
a = ssq;
} else {
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
}
cp = a;
AssignTimer.Stop();
// Handle trivial case of zero src
if (ssq == 0.){
@ -111,6 +137,7 @@ public:
std::cout << GridLogIterative << std::setprecision(8)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
PreambleTimer.Stop();
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
@ -156,6 +183,7 @@ public:
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
LogIteration(k,a,b);
IterationTimer.Stop();
if ( (k % 500) == 0 ) {
@ -183,13 +211,14 @@ public:
<< "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
// std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver Elapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
@ -202,18 +231,143 @@ public:
}
}
// Failed. Calculate true residual before giving up
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
TrueResidual = sqrt(norm2(p)/ssq);
// Linop.HermOpAndNorm(psi, mmp, d, qq);
// p = mmp - src;
//TrueResidual = sqrt(norm2(p)/ssq);
// TrueResidual = 1;
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
<<" residual "<< TrueResidual<< std::endl;
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
SolverTimer.Stop();
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage<< "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
// Optionally record the CG polynomial
std::vector<double> ak;
std::vector<double> bk;
std::vector<double> poly_p;
std::vector<double> poly_r;
std::vector<double> poly_Ap;
std::vector<double> polynomial;
public:
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
{ };
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
Field tmp(src.Grid());
Field AtoN(src.Grid());
AtoN = src;
psi=AtoN*polynomial[0];
for(int n=1;n<polynomial.size();n++){
tmp = AtoN;
Linop.HermOp(tmp,AtoN);
psi = psi + polynomial[n]*AtoN;
}
}
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
{
Field Ap(src.Grid());
Field r(src.Grid());
Field p(src.Grid());
p=src;
r=src;
x=Zero();
x.Checkerboard()=src.Checkerboard();
for(int k=0;k<ak.size();k++){
x = x + ak[k]*p;
Linop.HermOp(p,Ap);
r = r - ak[k] * Ap;
p = r + bk[k] * p;
}
}
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
psi=Zero();
this->operator ()(Linop,src,psi);
}
virtual void LogBegin(void)
{
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
ak.resize(0);
bk.resize(0);
polynomial.resize(0);
poly_Ap.resize(0);
poly_Ap.resize(0);
poly_p.resize(1);
poly_r.resize(1);
poly_p[0]=1.0;
poly_r[0]=1.0;
};
virtual void LogIteration(int k,RealD a,RealD b)
{
// With zero guess,
// p = r = src
//
// iterate:
// x = x + a p
// r = r - a A p
// p = r + b p
//
// [0]
// r = x
// p = x
// Ap=0
//
// [1]
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
// x = x + a p ==> add polynomials term by term
// r = r - a A p ==> add polynomials term by term
// p = r + b p ==> add polynomials term by term
//
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
ak.push_back(a);
bk.push_back(b);
// Ap= right_shift(p)
poly_Ap.resize(k+1);
poly_Ap[0]=0.0;
for(int i=0;i<k;i++){
poly_Ap[i+1]=poly_p[i];
}
// x = x + a p
polynomial.resize(k);
polynomial[k-1]=0.0;
for(int i=0;i<k;i++){
polynomial[i] = polynomial[i] + a * poly_p[i];
}
// r = r - a Ap
// p = r + b p
poly_r.resize(k+1);
poly_p.resize(k+1);
poly_r[k] = poly_p[k] = 0.0;
for(int i=0;i<k+1;i++){
poly_r[i] = poly_r[i] - a * poly_Ap[i];
poly_p[i] = poly_r[i] + b * poly_p[i];
}
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp);

View File

@ -102,11 +102,11 @@ public:
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
// remove dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;

View File

@ -123,11 +123,11 @@ public:
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;

View File

@ -156,11 +156,11 @@ public:
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
const int primary =0;

File diff suppressed because it is too large Load Diff

View File

@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@ -255,6 +256,7 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);

View File

@ -60,6 +60,32 @@ public:
}
};
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
Field res(in.Grid());
Field tmp(in.Grid());
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
_Guess(in,res);
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
_Matrix.Mdag(res,out); // out = Mdag res
}
};
template<class Field> class HPDSolver : public LinearFunction<Field> {
private:
LinearOperatorBase<Field> & _Matrix;

View File

@ -20,7 +20,7 @@ template<class Field> class PowerMethod
RealD evalMaxApprox = 0.0;
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_EST_ = 100;
const int _MAX_ITER_EST_ = 200;
for (int i=0;i<_MAX_ITER_EST_;i++) {
@ -30,18 +30,17 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n);
RealD na = vnum/vden;
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
// evalMaxApprox = na;
// return evalMaxApprox;
// }
evalMaxApprox = na;
src_n = tmp;
}
assert(0);
return 0;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
};
}

View File

@ -0,0 +1,76 @@
#pragma once
namespace Grid {
class Band
{
RealD lo, hi;
public:
Band(RealD _lo,RealD _hi)
{
lo=_lo;
hi=_hi;
}
RealD operator() (RealD x){
if ( x>lo && x<hi ){
return 1.0;
} else {
return 0.0;
}
}
};
class PowerSpectrum
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
std::vector<RealD> ranges;
std::vector<int> order;
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
template<class Field>
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
int N=ranges.size();
RealD hi = ranges[N-1];
RealD lo_band = 0.0;
RealD hi_band;
RealD nn=norm2(src);
RealD ss=0.0;
Field tmp = src;
for(int b=0;b<N;b++){
hi_band = ranges[b];
Band Notch(lo_band,hi_band);
Chebyshev<Field> polynomial;
polynomial.Init(0.0,hi,order[b],Notch);
polynomial.JacksonSmooth();
polynomial(HermOp,src,tmp) ;
RealD p=norm2(tmp);
ss=ss+p;
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
lo_band=hi_band;
}
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
return 0;
};
};
}

View File

@ -74,7 +74,7 @@ public:
void operator() (const Field &src, Field &psi){
psi=Zero();
// psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;

View File

@ -499,6 +499,87 @@ namespace Grid {
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, left preconditioned by Mee^inv
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
//
// Solve:
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
//
// Old notation e<->o
//
// Left precon by Moo^-1
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta

View File

@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x)
@ -95,7 +97,7 @@ public:
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
@ -108,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]);
@ -124,6 +126,53 @@ public:
}
}
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
subspace[b] = guess;
#endif
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
subspace[b] = noise;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
@ -160,14 +209,21 @@ public:
int b =0;
{
ComplexD ip;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
@ -213,8 +269,18 @@ public:
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
ComplexD ip;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
@ -228,6 +294,70 @@ public:
}
assert(b==nn);
}
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn);
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
// Generate a full sequence of Chebyshevs
for(int n=1;n<nn;n++){
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
Cheb(hermop,subspace[n-1],Mn);
for(int m=0;m<n;m++){
ComplexD c = innerProduct(subspace[m],Mn);
Mn = Mn - c*subspace[m];
}
// normalise
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[n]=Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
@ -310,6 +440,65 @@ public:
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void CreateSubspaceChebyshevNew(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
double hi
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
//#opt2(x) = acheb(x,3,90,300)* acheb(x,1,90,50) * acheb(x,0.5,90,200) * acheb(x,0.05,90,400) * acheb(x,0.01,90,1500)
/*266
Chebyshev<FineField> Cheb1(3.0,hi,300);
Chebyshev<FineField> Cheb2(1.0,hi,50);
Chebyshev<FineField> Cheb3(0.5,hi,300);
Chebyshev<FineField> Cheb4(0.05,hi,500);
Chebyshev<FineField> Cheb5(0.01,hi,2000);
*/
/* 242 */
/*
Chebyshev<FineField> Cheb3(0.1,hi,300);
Chebyshev<FineField> Cheb2(0.02,hi,1000);
Chebyshev<FineField> Cheb1(0.003,hi,2000);
8?
*/
/* How many??
*/
Chebyshev<FineField> Cheb2(0.001,hi,2500); // 169 iters on HDCG after refine
Chebyshev<FineField> Cheb1(0.02,hi,600);
// Chebyshev<FineField> Cheb2(0.001,hi,1500);
// Chebyshev<FineField> Cheb1(0.02,hi,600);
Cheb1(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb1 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
Cheb2(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb2 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb3(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb3 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb4(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb4 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
// Cheb5(hermop,noise,Mn); scale = std::pow(norm2(Mn),-0.5); noise=Mn*scale;
// hermop.Op(noise,tmp); std::cout<<GridLogMessage << "Cheb5 <n|MdagM|n> "<<norm2(tmp)<<std::endl;
subspace[b] = noise;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<< " norm " << norm2(noise)<<std::endl;
}
}
virtual void CreateSubspaceMultishift(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
@ -369,11 +558,46 @@ public:
FineField tmp(FineGrid);
for(int b =0;b<nbasis;b++)
{
RealD MirsShift = Lo;
ConjugateGradient<FineField> CGsloppy(tol,maxit,false);
ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,MirsShift);
ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,Lo);
tmp=Zero();
CGsloppy(hermop,subspace[b],tmp);
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b]=tmp;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
virtual void RefineSubspaceHDCG(LinearOperatorBase<FineField> &hermop,
TwoLevelADEF2mrhs<FineField,CoarseVector> & theHDCG,
int nrhs)
{
std::vector<FineField> src_mrhs(nrhs,FineGrid);
std::vector<FineField> res_mrhs(nrhs,FineGrid);
FineField tmp(FineGrid);
for(int b =0;b<nbasis;b+=nrhs)
{
tmp = subspace[b];
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b] =tmp;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "before filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
for(int r=0;r<MIN(nbasis-b,nrhs);r++){
src_mrhs[r] = subspace[b+r];
}
for(int r=0;r<nrhs;r++){
res_mrhs[r] = Zero();
}
theHDCG(src_mrhs,res_mrhs);
for(int r=0;r<MIN(nbasis-b,nrhs);r++){
tmp = res_mrhs[r];
RealD scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
subspace[b+r]=tmp;
}
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "after filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
@ -381,3 +605,4 @@ public:
};
NAMESPACE_END(Grid);

View File

@ -99,7 +99,7 @@ public:
CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd;
Vector<RealD> dag_factor;
deviceVector<RealD> dag_factor;
///////////////////////
// Interface
@ -124,9 +124,13 @@ public:
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -161,7 +165,7 @@ public:
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
};
void Mdag (const CoarseVector &in, CoarseVector &out)
@ -190,9 +194,14 @@ public:
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -201,10 +210,10 @@ public:
int osites=Grid()->oSites();
Vector<int> points(geom.npoint, 0);
for(int p=0; p<geom.npoint; p++)
points[p] = geom.points_dagger[p];
deviceVector<int> points(geom.npoint);
for(int p=0; p<geom.npoint; p++) {
acceleratorPut(points[p],geom.points_dagger[p]);
}
auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0];
@ -236,7 +245,7 @@ public:
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
void MdirComms(const CoarseVector &in)
@ -251,8 +260,14 @@ public:
out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite);
@ -285,7 +300,7 @@ public:
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{
@ -469,14 +484,20 @@ public:
// determine in what order we need the points
int npoint = geom.npoint-1;
Vector<int> points(npoint, 0);
for(int p=0; p<npoint; p++)
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
deviceVector<int> points(npoint);
for(int p=0; p<npoint; p++) {
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
auto points_p = &points[0];
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -539,7 +560,7 @@ public:
});
}
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@ -590,11 +611,13 @@ public:
}
// GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, {
int j = i/nbasis;
int k = i%nbasis;
dag_factor[i] = dag_factor_eigen(j, k);
h_dag_factor[i] = dag_factor_eigen(j, k);
});
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
}
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,

View File

@ -441,8 +441,20 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
@ -458,11 +470,9 @@ public:
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
for(int s=0;s<Subspace.subspace.size();s++){
std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
}
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
@ -498,7 +508,6 @@ public:
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
std::cout<<" Mkl "<<k<<" "<<l<<" "<<phase<<std::endl;
}
}
invMkl = Mkl.inverse();
@ -543,7 +552,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i];
phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
@ -553,15 +562,15 @@ public:
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl;
// std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl;
}
@ -570,7 +579,6 @@ public:
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
std::cout << i << " " <<k <<" "<<l<< " FT "<<norm2(FT[k])<<" "<<invMkl(l,k)<<std::endl;
}
int osites=CoarseGrid()->oSites();

View File

@ -88,7 +88,6 @@ public:
assert(A.size()==geom_srhs.npoint);
BLAStoGrid(A[p],BLAS_A[p]);
}
/*
void CopyMatrix (GeneralCoarseOp &_Op)
{
for(int p=0;p<geom.npoint;p++){
@ -97,6 +96,7 @@ public:
GridtoBLAS(Aup,BLAS_A[p]);
}
}
/*
void CheckMatrix (GeneralCoarseOp &_Op)
{
std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
@ -285,6 +285,7 @@ public:
Aggregation<Fobj,CComplex,nbasis> & Subspace,
GridBase *CoarseGrid)
{
#if 0
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
@ -362,7 +363,7 @@ public:
blockZAXPY(phaF[p],pha[p],one,zz);
}
// Could save on storage here
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
@ -388,7 +389,9 @@ public:
ComputeProj[p] = coarseInner;
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
for(int k=0;k<npoint;k++){
FT = Zero();
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
@ -411,7 +414,7 @@ public:
// PopulateAdag();
// }
// Need to write something to populate Adag from A
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
@ -425,7 +428,214 @@ Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us
Takes 600s to compute matrix elements, DOMINATED by the block project.
Easy to speed up with the batched block project.
Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
// Block project below taks to 240s
Grid : Message : 328.193418 s : CoarsenOperator phase 38338 us
Grid : Message : 328.193434 s : CoarsenOperator phaseBZ 1711226 us
Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
//Grid : Message : 328.193438 s : CoarsenOperator proj 1181154 us <-- this is mistimed
//Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us <-- Cut this ~10x if lucky by loop fusion
*/
#else
RealD tproj=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
MultiRHSBlockProject<Lattice<Fobj> > Projector;
Projector.Allocate(nbasis,grid,CoarseGrid);
Projector.ImportBasis(Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
tphase=-usecond();
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
// Count use small chunks than npoint == 81 and save memory
int batch = 9;
std::vector<FineField> _MphaV(batch,grid);
std::vector<CoarseVector> TmpProj(batch,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
// std::cout << GridLogMessage << " phasing the fine vector "<<std::endl;
// Fixme : do this in batches
for(int p=0;p<npoint;p+=batch){ // Loop over momenta in npoint
for(int b=0;b<MIN(batch,npoint-p);b++){
tphaseBZ-=usecond();
phaV = phaF[p+b]*Subspace.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
// Memory footprint was an issue
tmat-=usecond();
linop.Op(phaV,MphaV);
_MphaV[b] = MphaV;
tmat+=usecond();
}
// std::cout << GridLogMessage << " Calling block project "<<std::endl;
tproj-=usecond();
Projector.blockProject(_MphaV,TmpProj);
tproj+=usecond();
// std::cout << GridLogMessage << " conj phasing the coarse vectors "<<std::endl;
for(int b=0;b<MIN(batch,npoint-p);b++){
ComputeProj[p+b] = conjugate(pha[p+b])*TmpProj[b];
}
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
// std::cout << GridLogMessage << " Starting FT inv "<<std::endl;
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT = Zero();
// 81 kernel calls as many ComputeProj vectors
// Could fuse with a vector of views, but ugly
// Could unroll the expression and run fewer kernels -- much more attractive
// Could also do non blocking.
#if 0
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#else
const int radix = 9;
int ll;
for(ll=0;ll+radix-1<npoint;ll+=radix){
// When ll = npoint-radix, ll+radix-1 = npoint-1, and we do it all.
FT = FT
+ invMkl(ll+0,k)*ComputeProj[ll+0]
+ invMkl(ll+1,k)*ComputeProj[ll+1]
+ invMkl(ll+2,k)*ComputeProj[ll+2]
+ invMkl(ll+3,k)*ComputeProj[ll+3]
+ invMkl(ll+4,k)*ComputeProj[ll+4]
+ invMkl(ll+5,k)*ComputeProj[ll+5]
+ invMkl(ll+6,k)*ComputeProj[ll+6]
+ invMkl(ll+7,k)*ComputeProj[ll+7]
+ invMkl(ll+8,k)*ComputeProj[ll+8];
}
for(int l=ll;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#endif
// 1 kernel call -- must be cheaper
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
// std::cout << GridLogMessage << " Calling GridtoBLAS "<<std::endl;
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
#endif
}
void Mdag(const CoarseVector &in, CoarseVector &out)
{
@ -503,12 +713,12 @@ Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult tot "<<t_tot<<" us"<<std::endl;
std::cout << GridLogMessage<<std::endl;
std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
*/
// std::cout << GridLogMessage<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
virtual void Mdiag (const Field &in, Field &out){ assert(0);};

View File

@ -54,6 +54,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -66,7 +69,7 @@ public:
}
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
@ -100,6 +103,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -145,6 +151,9 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@ -165,19 +174,10 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
#ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
/*
template<class T> class vecView
@ -188,8 +188,9 @@ template<class T> class vecView
ViewMode mode;
void * cpu_ptr;
public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(std::vector<T> &refer_to_me,ViewMode _mode)
vecView(Vector<T> &refer_to_me,ViewMode _mode)
{
cpu_ptr = &refer_to_me[0];
size = refer_to_me.size();
@ -205,22 +206,12 @@ template<class T> class vecView
}
};
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
{
vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed
}
// Little autoscope assister
template<class View>
class VectorViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
VectorViewCloser(View &_v) : v(_v) {};
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
};
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);

View File

@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
#if defined(__has_feature)
#if __has_feature(leak_sanitizer)
#define ASAN_LEAK_CHECK
#endif
#endif
#ifdef ASAN_LEAK_CHECK
#include <sanitizer/asan_interface.h>
#include <sanitizer/common_interface_defs.h>
#include <sanitizer/lsan_interface.h>
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
#else
#define LEAK_CHECK(A) { }
#endif
void MemoryManager::DisplayMallinfo(void)
{
#ifdef __linux__
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
mi = mallinfo();
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
#endif
LEAK_CHECK();
}
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
#ifdef GRID_CUDA
cuda_mem();
#endif
DisplayMallinfo();
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }

View File

@ -211,6 +211,7 @@ private:
#endif
public:
static void DisplayMallinfo(void);
static void NotifyDeletion(void * CpuPtr);
static void Print(void);
static void PrintAll(void);

View File

@ -1,16 +1,15 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...)
//#define mprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return;
@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else {
assert(0);
}
@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n");
dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache);
}
@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache);
} else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)

View File

@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
std::vector<uint64_t> pagedata(npages);
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;

View File

@ -82,6 +82,7 @@ public:
bool _isCheckerBoarded;
int LocallyPeriodic;
Coordinate _checker_dim_mask;
int _checker_dim;
public:
@ -89,7 +90,7 @@ public:
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoarded(int dim) =0;
virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;

View File

@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
public:
int dummy;
Coordinate _checker_dim_mask;
// Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
@ -46,7 +46,7 @@ public:
{
return 0;
}
virtual int CheckerBoarded(int dim){
virtual int CheckerBoarded(int dim) {
return 0;
}
virtual int CheckerBoard(const Coordinate &site){
@ -106,6 +106,7 @@ public:
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_checker_dim = -1;
_lstart.resize(_ndimension);
_lend.resize(_ndimension);

View File

@ -57,9 +57,10 @@ class GridRedBlackCartesian : public GridBase
{
public:
// Coordinate _checker_dim_mask;
int _checker_dim;
// int _checker_dim;
std::vector<int> _checker_board;
virtual int isCheckerBoarded(void) const { return 1; };
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;
@ -147,7 +148,7 @@ public:
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
}
virtual ~GridRedBlackCartesian() = default;
void Init(const Coordinate &dimensions,

View File

@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);

View File

@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
@ -127,7 +129,36 @@ public:
void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
@ -138,8 +169,8 @@ public:
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void CommsComplete(std::vector<CommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
@ -158,6 +189,17 @@ public:
int recv_from_rank,int do_recv,
int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,

View File

@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
@ -257,15 +258,41 @@ CartesianCommunicator::~CartesianCommunicator()
}
}
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
@ -287,27 +314,18 @@ void CartesianCommunicator::GlobalMax(double &d)
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
@ -332,7 +350,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
assert(ierr==0);
list.push_back(xrq);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{
int nreq=list.size();
@ -351,9 +369,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
std::vector<MpiCommsRequest_t> reqs(0);
int myrank = _processor;
int ierr;
@ -369,9 +385,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@ -381,11 +394,287 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
@ -410,45 +699,109 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
}
#else
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
if (nreq==0) return;
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void)
{
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@ -456,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
//}
void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@ -479,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@ -501,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

View File

@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,

View File

@ -40,6 +40,9 @@ int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
void * GlobalSharedMemory::HostCommBuf;
#endif
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
void *SharedMemory::HostBufferMalloc(size_t bytes){
void *ptr = (void *)host_heap_top;
host_heap_top += bytes;
host_heap_bytes+= bytes;
if (host_heap_bytes >= host_heap_size) {
std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(host_heap_bytes<host_heap_size);
}
return ptr;
}
void SharedMemory::HostBufferFreeAll(void) {
host_heap_top =(size_t)HostCommBuf;
host_heap_bytes=0;
}
#endif
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;

View File

@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
@ -75,7 +107,9 @@ public:
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
static void *HostCommBuf;
#endif
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
@ -103,7 +137,7 @@ public:
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};
@ -120,6 +154,13 @@ private:
size_t heap_bytes;
size_t heap_size;
#ifndef ACCELERATOR_AWARE_MPI
size_t host_heap_top; // set in free all
size_t host_heap_bytes;// set in free all
void *HostCommBuf; // set in SetCommunicator
size_t host_heap_size; // set in SetCommunicator
#endif
protected:
Grid_MPI_Comm ShmComm; // for barriers
@ -151,7 +192,10 @@ public:
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
#ifndef ACCELERATOR_AWARE_MPI
void *HostBufferMalloc(size_t bytes);
void HostBufferFreeAll(void);
#endif
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////

View File

@ -39,9 +39,16 @@ Author: Christoph Lehner <christoph@lhnr.de>
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCL
#ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#define SHM_SOCKETS
#endif
#include <sys/socket.h>
@ -512,46 +519,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
//if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = ShmCommBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@ -574,6 +541,40 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@ -604,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
@ -738,7 +739,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#else
#ifdef GRID_MPI3_SHMMMAP
@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes);
//#else
// bcopy(src,dest,bytes);
//#endif
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@ -959,9 +959,16 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
}
ShmBufferFreeAll();
#ifndef ACCELERATOR_AWARE_MPI
host_heap_size = heap_size;
HostCommBuf= GlobalSharedMemory::HostCommBuf;
HostBufferFreeAll();
#endif
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
@ -983,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
//SharedMemoryTest();
// SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@ -1005,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}
void *SharedMemory::ShmBuffer(int rank)

View File

@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{

View File

@ -29,13 +29,28 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern Vector<std::pair<int,int> > Cshift_table;
extern std::vector<std::pair<int,int> > Cshift_table;
extern deviceVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void)
{
// GPU version
uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz);
}
acceleratorCopyToDevice((void *)&Cshift_table[0],
(void *)&Cshift_table_device[0],
sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0];
// CPU version use identify map
}
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@ -74,18 +89,11 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
}
{
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@ -110,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@ -121,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@ -156,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset);
}
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@ -225,18 +201,11 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
{
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
}
@ -259,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@ -268,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
@ -297,30 +257,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
}
}
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
template <typename T>
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= e1*e2) return;
int n, b, o;
n = idx / e2;
b = idx % e2;
o = n*stride + b;
vector[2*idx + 0] = lo + o;
vector[2*idx + 1] = ro + o;
}
#endif
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
@ -345,20 +281,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int ent=0;
if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@ -372,20 +300,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}
{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@ -409,19 +329,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int ent=0;
if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@ -432,20 +344,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}
{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
}
}

View File

@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
@ -52,17 +52,20 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
RealD t1,t0;
t0=usecond();
if ( !comm_dim ) {
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
// std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
// std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
// std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
t1=usecond();
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
@ -91,18 +94,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
//std::cout << "Single pass Cshift_comms" <<std::endl;
// std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
//std::cout << "Two pass Cshift_comms" <<std::endl;
// std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
@ -122,21 +123,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
@ -144,26 +153,52 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
tscatter+=usecond();
}
}
if (Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -181,15 +216,21 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
// std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
@ -198,16 +239,20 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
@ -227,7 +272,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
tgather-=usecond();
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
@ -252,216 +299,51 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)recv_buf_extract_mpi,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,5 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
Vector<std::pair<int,int> > Cshift_table;
std::vector<std::pair<int,int> > Cshift_table;
deviceVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid);

View File

@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/PaddedCell.h>

View File

@ -257,18 +257,68 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
});
}
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpy_norm");
return axpy_norm_fast(ret,a,x,y);
#ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpby_norm");
return axpby_norm_fast(ret,a,b,x,y);
#ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
}
/// Trace product
template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
-> Lattice<decltype(trace(obj()))>
{
typedef decltype(trace(obj())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( rhs2 , rhs_2, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
-> Lattice<decltype(trace(obj1()))>
{
typedef decltype(trace(obj1())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1)
-> Lattice<decltype(trace(obj1()))>
{
return traceProduct(rhs_1,rhs_2);
}
NAMESPACE_END(Grid);
#endif

View File

@ -234,10 +234,23 @@ public:
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(*vvtmp_p);
coalescedWrite(me[ss],stmp);
});
#else
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss]= r;
});
me[ss]= r;
});
#endif
me.ViewClose();
return *this;
}

View File

@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View;
Vector<View> basis_v; basis_v.reserve(basis.size());
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorWrite));
h_basis_v[k] = basis[k].View(AcceleratorWrite);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
}
#if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
View *basis_vp = &d_basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
Vector <vobj> Bt(siteBlock * nrot);
deviceVector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of matrix
Vector<Coeff_t> Qt_jv(Nm*Nm);
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
Qt_p[i]=Qt(j,k);
h_Qt_jv[i]=Qt(j,k);
});
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
}
// Extract a single rotated vector
@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard();
Vector<View> basis_v; basis_v.reserve(basis.size());
hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorRead));
h_basis_v[k]=basis[k].View(AcceleratorRead);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0];
vobj zz=Zero();
deviceVector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
auto basis_vp=& d_basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero();
@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
}
coalescedWrite(result_v[ss], B);
});
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
}
template<class Field>

View File

@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int
}
}
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);

View File

@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site));
// assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
for(int w=0;w<words;w++){
pt[w] = getlane(vp[w],idx);
}
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
return;
};
template<class vobj,class sobj>
@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site));
// assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);

View File

@ -31,6 +31,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
#include <Grid/lattice/Lattice_slicesum_core.h>
NAMESPACE_BEGIN(Grid);
@ -45,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
std::vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
@ -74,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
std::vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
@ -263,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites();
// Might make all code paths go this way.
#if 0
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
});
}
#else
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
deviceVector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
@ -294,18 +279,57 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
}
#endif
// This is in single precision and fails some tests
auto anrm = sumD(inner_tmp_v,sites);
nrm = anrm;
return nrm;
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// Hack
// Fast integer xor checksum. Can also be used in comms now.
autoView(l_v,left,AcceleratorRead);
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
} else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
#endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm);
ok = FlightRecorder::NormLog(real(nrm));
if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@ -341,20 +365,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
@ -362,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// z_v
{
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&z_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// inner_v
{
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
}
#endif
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm));
assert(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@ -374,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
conformable(left,right);
typedef typename vobj::vector_typeD vector_type;
Vector<ComplexD> tmp(2);
std::vector<ComplexD> tmp(2);
GridBase *grid = left.Grid();
@ -384,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
// GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
deviceVector<inner_t> inner_tmp(sites);
deviceVector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
{
@ -435,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
std::vector<typename vobj::scalar_object> &result,
int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
@ -457,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
std::vector<vobj> lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
@ -469,19 +519,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
int ostride=grid->_ostride[orthogdim];
//Reduce Data down to lvSum
sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
@ -515,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
}
template<class vobj> inline
std::vector<typename vobj::scalar_object>
@ -525,6 +568,20 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
return result;
}
/*
Reimplement
1)
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
2)
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
3)
-- Make Slice Mul Matrix call sliceMaddMatrix
*/
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
@ -544,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
Vector<vector_type> lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<vector_type> lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@ -675,203 +732,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0);
std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(0);
std::vector<int> latt_phys(NN-1);
Coordinate simd_phys;
std::vector<int> mpi_phys(NN-1);
Coordinate checker_dim_mask(NN-1);
int checker_dim=-1;
int dd;
for(int d=0;d<NN;d++){
if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
mpi_phys[dd] =BlockSolverGrid->_processors[d];
checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
dd++;
}
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
if(BlockSolverGrid->_isCheckerBoarded) {
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
delete tmp;
return (GridBase *) ret;
} else {
return (GridBase *) tmp;
}
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
GridBase *FullGrid = X.Grid();
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Ys(SliceGrid);
Lattice<vobj> Rs(SliceGrid);
Lattice<vobj> Xs(SliceGrid);
Lattice<vobj> RR(FullGrid);
RR = R; // Copies checkerboard for insert
typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( X_v, X, CpuRead);
autoView( Y_v, Y, CpuRead);
autoView( R_v, R, CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
for(int i=0;i<Nslice;i++){
ExtractSlice(Ys,Y,i,Orthog);
ExtractSlice(Rs,R,i,Orthog);
Rs=Ys;
for(int j=0;j<Nslice;j++){
ExtractSlice(Xs,X,j,Orthog);
Rs = Rs + Xs*(scale*aa(j,i));
}
InsertSlice(Rs,RR,i,Orthog);
}
R=RR; // Copy back handles arguments aliasing case
delete SliceGrid;
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( R_v, R, CpuWrite);
autoView( X_v, X, CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
R=Zero();
sliceMaddMatrix(R,aa,X,R,Orthog,scale);
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
Lattice<vobj> ls(SliceGrid);
Lattice<vobj> rs(SliceGrid);
typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}});
thread_critical
{
mat += mat_thread;
}
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
for(int s=0;s<Nslice;s++){
ExtractSlice(ls,lhs,s,Orthog);
for(int ss=0;ss<Nslice;ss++){
ExtractSlice(rs,rhs,ss,Orthog);
mat(s,ss) = innerProduct(ls,rs);
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
delete SliceGrid;
}
NAMESPACE_END(Grid);

View File

@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
auto discard=hipGetDevice(&device);
auto r=hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize;
@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
deviceVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
result = *buffer_v;
#endif
return result;
}
@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
const int words = sizeof(vobj)/sizeof(vector);
Vector<vector> buffer(osites);
deviceVector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];

View File

@ -4,29 +4,28 @@ NAMESPACE_BEGIN(Grid);
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
sobj identity; zeroit(identity);
sobj ret ;
sobj identity; zeroit(identity);
sobj ret; zeroit(ret);
Integer nsimd= vobj::Nsimd();
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction,
[=] (cl::sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
theGridAccelerator->wait();
ret = mysum[0];
free(mysum,*theGridAccelerator);
{
sycl::buffer<sobj, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(sycl::range<1>{osites},
Reduction,
[=] (sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
}
sobjD dret; convertType(dret,ret);
return dret;
}
@ -69,57 +68,25 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
return result;
}
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
sum ^=vec[index];
});
});
}
theGridAccelerator->wait();
return ret;
}
NAMESPACE_END(Grid);
/*
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
{
Double sumResult; zeroit(sumResult);
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
Double identity; zeroit(identity);
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum +=vec[index];
});
});
theGridAccelerator->wait();
Double ret = d_sum[0];
free(d_sum,*theGridAccelerator);
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/

View File

@ -152,6 +152,7 @@ public:
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
#if 0
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
@ -162,9 +163,9 @@ public:
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
@ -179,6 +180,9 @@ public:
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
#else
eng.discardhi(site);
#endif
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
@ -412,7 +416,7 @@ public:
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@ -429,7 +433,6 @@ public:
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
#if 1
thread_for( lidx, _grid->lSites(), {
int64_t gidx;
@ -450,29 +453,12 @@ public:
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
});
#else
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int o_idx;
int i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
if ( britney ) {
Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
} else {
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
#endif
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient

View File

@ -0,0 +1,267 @@
#pragma once
#if defined(GRID_CUDA)
#include <cub/cub.cuh>
#define gpucub cub
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#elif defined(GRID_HIP)
#include <hipcub/hipcub.hpp>
#define gpucub hipcub
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#endif
NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj>
inline void sliceSumReduction_cub_small(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
size_t subvol_size = e1*e2;
deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
vobj zero_init;
zeroit(zero_init);
void *temp_storage_array = NULL;
size_t temp_storage_bytes = 0;
vobj *d_out;
int* d_offsets;
std::vector<int> offsets(rd+1,0);
for (int i = 0; i < offsets.size(); i++) {
offsets[i] = i*subvol_size;
}
//Allocate memory for output and offset arrays on device
d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
//allocate memory for temp_storage_array
temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
//prepare buffer for reduction
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
//use 2d accelerator_for to avoid launch latencies found when serially looping over rd
accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
//issue segmented reductions in computeStream
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();
acceleratorFreeDevice(temp_storage_array);
acceleratorFreeDevice(d_out);
acceleratorFreeDevice(d_offsets);
}
#endif
#if defined(GRID_SYCL)
template<class vobj>
inline void sliceSumReduction_sycl_small(const vobj *Data,
std::vector <vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
size_t subvol_size = e1*e2;
vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
vobj vobj_zero;
zeroit(vobj_zero);
for (int r = 0; r<rd; r++) {
mysum[r] = vobj_zero;
}
deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
// autoView(Data_v, Data, AcceleratorRead);
//prepare reduction buffer
accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(sycl::range<1>{subvol_size},
Reduction,
[=](sycl::id<1> item, auto &sum) {
auto s = item[0];
sum += rb_p[r*subvol_size+s];
});
});
}
theGridAccelerator->wait();
for (int r = 0; r < rd; r++) {
lvSum[r] = mysum[r];
}
free(mysum,*theGridAccelerator);
}
#endif
template<class vobj>
inline void sliceSumReduction_large(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2;
deviceVector<vector>buffer(osites);
vector *dat = (vector *)Data;
vector *buf = &buffer[0];
std::vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
#elif defined(GRID_SYCL)
sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
#endif
for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r];
}
}
}
template<class vobj>
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#elif defined (GRID_SYCL)
sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
else {
sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
}
template<class vobj>
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*ostride; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
}
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
NAMESPACE_END(Grid);

View File

@ -981,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[ddl++];
hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
}
}
peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor);
@ -1003,6 +1009,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
assert(orthog<nh);
assert(orthog>=0);
assert(hg->_processors[orthog]==1);
lowDim.Checkerboard() = higherDim.Checkerboard();
int dl; dl = 0;
for(int d=0;d<nh;d++){
@ -1020,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
int ddl=0;
hcoor[orthog] = slice;
int ddl=0;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[ddl++];
hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
}
}
peekLocalSite(s,higherDimv,hcoor);

View File

@ -45,6 +45,7 @@ public:
};
// Host only
GridBase * getGrid(void) const { return _grid; };
vobj* getHostPointer(void) const { return _odata; };
};
/////////////////////////////////////////////////////////////////////////////////////////

View File

@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
*
*/
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
Lattice<vobj> &lat,
int x,
int dim,
@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
});
}
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
const Lattice<vobj> &lat,
int x,
int dim,
@ -462,13 +462,19 @@ public:
int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static cshiftVector<vobj> send_buf;
static cshiftVector<vobj> recv_buf;
static deviceVector<vobj> send_buf;
static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<CommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req;
std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req;
int words = buffer_size;
int bytes = words * sizeof(vobj);
@ -495,9 +501,16 @@ public:
t_gather+=usecond()-t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
@ -508,9 +521,16 @@ public:
t_gather+= usecond() - t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
t_comms+=usecond()-t;
}
@ -533,8 +553,13 @@ public:
t=usecond();
grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@ -543,6 +568,11 @@ public:
t=usecond();
grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();

View File

@ -179,11 +179,11 @@ extern GridLogger GridLogSolver;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogDebug;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogIterative;
extern GridLogger GridLogIntegrator;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern GridLogger GridLogTracing;
@ -191,6 +191,41 @@ extern Colours GridLogColours;
std::string demangle(const char* name) ;
template<typename... Args>
inline std::string sjoin(Args&&... args) noexcept {
std::ostringstream msg;
(msg << ... << args);
return msg.str();
}
/*! @brief make log messages work like python print */
template <typename... Args>
inline void Grid_log(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << GridLogMessage << msg << std::endl;
}
/*! @brief make warning messages work like python print */
template <typename... Args>
inline void Grid_warn(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl;
}
/*! @brief make error messages work like python print */
template <typename... Args>
inline void Grid_error(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl;
}
/*! @brief make pass messages work like python print */
template <typename... Args>
inline void Grid_pass(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
}
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];

View File

@ -34,7 +34,7 @@ class GridTracer {
};
inline void tracePush(const char *name) { roctxRangePushA(name); }
inline void tracePop(const char *name) { roctxRangePop(); }
inline int traceStart(const char *name) { roctxRangeStart(name); }
inline int traceStart(const char *name) { return roctxRangeStart(name); }
inline void traceStop(int ID) { roctxRangeStop(ID); }
#endif

View File

@ -98,7 +98,7 @@ public:
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
/////////////////////////////////////////////////////////////
// virtual smeared interface through configuration container
/////////////////////////////////////////////////////////////
@ -129,6 +129,26 @@ public:
virtual ~Action(){}
};
template <class GaugeField >
class EmptyAction : public Action <GaugeField>
{
using Action<GaugeField>::refresh;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() { return std::string("Level Force Log"); };
virtual std::string LogParameters() { return std::string("No parameters");};
};
NAMESPACE_END(Grid);
#endif // ACTION_BASE_H

View File

@ -55,6 +55,11 @@ public:
RealD alpha; // Mobius scale
RealD k; // EOFA normalization constant
// Device resident
deviceVector<Coeff_t> d_shift_coefficients;
deviceVector<Coeff_t> d_MooeeInv_shift_lc;
deviceVector<Coeff_t> d_MooeeInv_shift_norm;
virtual void Instantiatable(void) = 0;
// EOFA-specific operations
@ -92,6 +97,11 @@ public:
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
d_shift_coefficients.resize(Ls);
d_MooeeInv_shift_lc.resize(Ls);
d_MooeeInv_shift_norm.resize(Ls);
};
};

View File

@ -90,16 +90,16 @@ public:
void M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
Vector<Coeff_t> &lower,
Vector<Coeff_t> &diag,
Vector<Coeff_t> &upper);
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
void M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
Vector<Coeff_t> &lower,
Vector<Coeff_t> &diag,
Vector<Coeff_t> &upper);
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
virtual void Instantiatable(void)=0;
@ -119,35 +119,51 @@ public:
RealD mass_plus, mass_minus;
// Save arguments to SetCoefficientsInternal
Vector<Coeff_t> _gamma;
std::vector<Coeff_t> _gamma;
RealD _zolo_hi;
RealD _b;
RealD _c;
// possible boost
std::vector<ComplexD> qmu;
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
// Cayley form Moebius (tanh and zolotarev)
Vector<Coeff_t> omega;
Vector<Coeff_t> bs; // S dependent coeffs
Vector<Coeff_t> cs;
Vector<Coeff_t> as;
std::vector<Coeff_t> omega;
std::vector<Coeff_t> bs; // S dependent coeffs
std::vector<Coeff_t> cs;
std::vector<Coeff_t> as;
// For preconditioning Cayley form
Vector<Coeff_t> bee;
Vector<Coeff_t> cee;
Vector<Coeff_t> aee;
Vector<Coeff_t> beo;
Vector<Coeff_t> ceo;
Vector<Coeff_t> aeo;
std::vector<Coeff_t> bee;
std::vector<Coeff_t> cee;
std::vector<Coeff_t> aee;
std::vector<Coeff_t> beo;
std::vector<Coeff_t> ceo;
std::vector<Coeff_t> aeo;
// LDU factorisation of the eeoo matrix
Vector<Coeff_t> lee;
Vector<Coeff_t> leem;
Vector<Coeff_t> uee;
Vector<Coeff_t> ueem;
Vector<Coeff_t> dee;
std::vector<Coeff_t> lee;
std::vector<Coeff_t> leem;
std::vector<Coeff_t> uee;
std::vector<Coeff_t> ueem;
std::vector<Coeff_t> dee;
// Device memory
deviceVector<Coeff_t> d_diag;
deviceVector<Coeff_t> d_upper;
deviceVector<Coeff_t> d_lower;
deviceVector<Coeff_t> d_lee;
deviceVector<Coeff_t> d_dee;
deviceVector<Coeff_t> d_uee;
deviceVector<Coeff_t> d_leem;
deviceVector<Coeff_t> d_ueem;
// Matrices of 5d ee inverse params
Vector<iSinglet<Simd> > MatpInv;
Vector<iSinglet<Simd> > MatmInv;
Vector<iSinglet<Simd> > MatpInvDag;
Vector<iSinglet<Simd> > MatmInvDag;
// std::vector<iSinglet<Simd> > MatpInv;
// std::vector<iSinglet<Simd> > MatmInv;
// std::vector<iSinglet<Simd> > MatpInvDag;
// std::vector<iSinglet<Simd> > MatmInvDag;
///////////////////////////////////////////////////////////////
// Conserved current utilities
@ -187,7 +203,7 @@ public:
protected:
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,196 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Copyright (C) 2020 - 2025
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion5D<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
int n_rhs;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@ -60,6 +60,50 @@ public:
// virtual void Instantiatable(void)=0;
virtual void Instantiatable(void) =0;
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
{
std::cout << "Free Propagator for PartialFraction"<<std::endl;
FermionField in_k(in.Grid());
FermionField prop_k(in.Grid());
FFT theFFT((GridCartesian *) in.Grid());
//phase for boundary condition
ComplexField coor(in.Grid());
ComplexField ph(in.Grid()); ph = Zero();
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
double boundary_phase = ::acos(real(boundary[nu]));
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
//momenta for propagator shifted by twist+boundary
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
}
in_buf = exp(ci*ph*(-1.0))*in;
theFFT.FFT_all_dim(in_k,in,FFT::forward);
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
//phase for boundary condition
out = out * exp(ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
std::vector<Complex> boundary;
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
FreePropagator(in,out,mass,boundary,twist);
};
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
@ -90,12 +134,12 @@ protected:
RealD mass;
RealD R;
RealD ZoloHiInv;
Vector<double> Beta;
Vector<double> cc;;
Vector<double> cc_d;;
Vector<double> sqrt_cc;
Vector<double> See;
Vector<double> Aee;
std::vector<double> Beta;
std::vector<double> cc;;
std::vector<double> cc_d;;
std::vector<double> sqrt_cc;
std::vector<double> See;
std::vector<double> Aee;
};

View File

@ -69,10 +69,10 @@ public:
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
virtual void RefreshShiftCoefficients(RealD new_shift);
@ -83,7 +83,7 @@ public:
RealD _M5, const ImplParams& p=ImplParams());
protected:
void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
};
NAMESPACE_END(Grid);

View File

@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;

View File

@ -102,11 +102,11 @@ public:
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
@ -164,8 +164,6 @@ public:
DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities

View File

@ -100,7 +100,6 @@ public:
int dag);
void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
@ -108,7 +107,6 @@ public:
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
@ -116,7 +114,6 @@ public:
int dag);
void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
@ -192,8 +189,6 @@ public:
DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@ -42,11 +42,11 @@ public:
public:
// Shift operator coefficients for red-black preconditioned Mobius EOFA
Vector<Coeff_t> Mooee_shift;
Vector<Coeff_t> MooeeInv_shift_lc;
Vector<Coeff_t> MooeeInv_shift_norm;
Vector<Coeff_t> MooeeInvDag_shift_lc;
Vector<Coeff_t> MooeeInvDag_shift_norm;
std::vector<Coeff_t> Mooee_shift;
std::vector<Coeff_t> MooeeInv_shift_lc;
std::vector<Coeff_t> MooeeInv_shift_norm;
std::vector<Coeff_t> MooeeInvDag_shift_lc;
std::vector<Coeff_t> MooeeInvDag_shift_norm;
virtual void Instantiatable(void) {};
@ -74,18 +74,18 @@ public:
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
Vector<Coeff_t>& shift_coeffs);
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
Vector<Coeff_t>& shift_coeffs);
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs);
virtual void RefreshShiftCoefficients(RealD new_shift);

View File

@ -102,11 +102,11 @@ public:
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
@ -152,9 +152,6 @@ public:
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////

View File

@ -42,7 +42,7 @@ public:
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
};
// Constructors
OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,

View File

@ -41,6 +41,10 @@ public:
public:
// Constructors
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -41,6 +41,9 @@ public:
public:
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors
OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -40,6 +40,9 @@ public:
INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors
OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -41,6 +41,9 @@ public:
public:
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors
OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -40,6 +40,11 @@ public:
INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors
OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
public:
INHERIT_IMPL_TYPES(Impl);
const int part_frac_chroma_convention=1;
const int part_frac_chroma_convention=0;
void Meooe_internal(const FermionField &in, FermionField &out,int dag);
void Mooee_internal(const FermionField &in, FermionField &out,int dag);
@ -83,19 +83,78 @@ public:
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,const ImplParams &p= ImplParams());
PartialFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
{
std::cout << "Free Propagator for PartialFraction"<<std::endl;
FermionField in_k(in.Grid());
FermionField prop_k(in.Grid());
FFT theFFT((GridCartesian *) in.Grid());
//phase for boundary condition
ComplexField coor(in.Grid());
ComplexField ph(in.Grid()); ph = Zero();
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
double boundary_phase = ::acos(real(boundary[nu]));
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
//momenta for propagator shifted by twist+boundary
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
}
in_buf = exp(ci*ph*(-1.0))*in;
theFFT.FFT_all_dim(in_k,in,FFT::forward);
if ( this->qmu.size() ){
this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
} else {
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
}
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
//phase for boundary condition
out = out * exp(ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
std::vector<Complex> boundary;
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
FreePropagator(in,out,mass,boundary,twist);
};
void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
protected:
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
std::vector<RealD> qmu;
// Part frac
RealD mass;
RealD dw_diag;
RealD R;
RealD amax;
RealD scale;
Vector<double> p;
Vector<double> q;
std::vector<double> p;
std::vector<double> q;
};

View File

@ -35,7 +35,7 @@ template<class Matrix, class Field>
class KappaSimilarityTransform {
public:
INHERIT_IMPL_TYPES(Matrix);
Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
KappaSimilarityTransform (Matrix &zmob) {
for (int i=0;i<(int)zmob.bs.size();i++) {

View File

@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
public:
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
void DhopImproved(StencilImpl &st,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
void DhopNaive(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);

View File

@ -47,7 +47,7 @@ public:
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
@ -109,7 +109,7 @@ public:
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
@ -197,7 +197,7 @@ public:
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
@ -208,7 +208,7 @@ public:
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
@ -402,7 +402,6 @@ public:
typedef CartesianStencil<vobj,cobj,Parameters> Base;
typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector;
// Vector<int> surface_list;
WilsonStencil(GridBase *grid,
@ -415,29 +414,6 @@ public:
// surface_list.resize(0);
this->same_node.resize(npoints);
};
/*
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
this->same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
*/
template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
@ -508,6 +484,11 @@ public:
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
#ifdef NVLINK_GET
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
#endif
}
};

View File

@ -126,14 +126,17 @@ public:
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
void DhopInternal(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerial(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
// Constructor
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
@ -168,9 +171,6 @@ public:
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
WilsonAnisotropyCoefficients anisotropyCoeff;
///////////////////////////////////////////////////////////////

View File

@ -91,13 +91,13 @@ public:
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
@ -109,6 +109,8 @@ public:
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
std::vector<double> qmu) ;
// Implement hopping term non-hermitian hopping term; half cb or both
// Implement s-diagonal DW
@ -117,6 +119,9 @@ public:
void DhopOE(const FermionField &in, FermionField &out,int dag);
void DhopEO(const FermionField &in, FermionField &out,int dag);
void DhopComms (const FermionField &in, FermionField &out);
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
// add a DhopComm
// -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
@ -135,21 +140,18 @@ public:
int dag);
void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
@ -203,9 +205,6 @@ public:
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@ -57,6 +57,10 @@ public:
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids);
static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;

View File

@ -63,7 +63,9 @@ public:
virtual void MooeeDag(const FermionField &in, FermionField &out) ;
virtual void MooeeInv(const FermionField &in, FermionField &out) ;
virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
virtual void M(const FermionField &in, FermionField &out) ;
virtual void Mdag(const FermionField &in, FermionField &out) ;
private:
RealD mu; // TwistedMass parameter

View File

@ -58,7 +58,7 @@ public:
{
// RealD eps = 1.0;
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
Vector<Coeff_t> zgamma(this->Ls);
std::vector<Coeff_t> zgamma(this->Ls);
for(int s=0;s<this->Ls;s++){
zgamma[s] = gamma[s];
}

Some files were not shown because too many files have changed in this diff Show More