mirror of https://github.com/paboyle/Grid.git synced 2025-11-03 05:24:32 +00:00

Compare commits


314 Commits

Author SHA1 Message Date
6e40e22004 aarch64 libunwind compatibility fix 2025-10-30 16:45:43 +01:00
Peter Boyle
6165931afa Update GridStd.h 2025-10-03 14:35:37 -04:00
23581333e6 link cufft 2025-08-21 22:25:55 +01:00
e5fa3d887f Compile on CUDA 2025-08-21 22:10:27 +01:00
583fa7bb0a FFTW guarded after CUDA and HIP 2025-08-21 22:00:12 +01:00
Peter Boyle
fe0db53842 FFT offload to GPU and MUCH faster comms.
40x speed up on Frontier
2025-08-21 16:45:38 -04:00
Peter Boyle
76c0ada1e1 Benchmark for En Hung 2025-08-21 16:45:38 -04:00
Peter Boyle
92f49e9194 Merge pull request #482 from g-simonetti/wflow_sp2n_paboyle
Fixed Wilson flow for Nc not equal to 3
2025-08-21 09:10:25 -04:00
Peter Boyle
44c8057b5f Merge pull request #481 from vataspro/sp-reps-fix
Only compile higher fermion representations for symplectic gauge group when requested via configure flag
2025-08-20 12:57:28 -04:00
Alexis Provatas
0ad837f595 Fix Sp representations compilation 2025-08-20 17:48:39 +01:00
Peter Boyle
bd2103c746 Merge pull request #480 from vataspro/fix-no-comms
Fix enable-comms=none
2025-08-20 12:26:47 -04:00
Alexis Provatas
9c18d2ddb0 Fix StencilSendToRecvFromBegin to agree with base 2025-08-20 17:17:06 +01:00
g-simonetti
1245a8c151 num_colours added to class S 2025-08-20 16:27:34 +01:00
g-simonetti
07113dc8ba Changed beta=3 to beta=Nc with comments 2025-08-20 16:18:34 +01:00
a3420e6fa9 Update for grid view logging 2025-08-14 21:29:20 +00:00
732836d9f8 Missed one 2025-08-14 20:25:54 +00:00
87658f7b53 ASSERT tripped in Shuhei's branch 2025-08-14 20:08:54 +00:00
e7f51e5fb1 Timer pointers for hadrons compat.
Reluctantly; passing timers around like this is a silly interface.
2025-08-11 21:11:36 +01:00
Peter Boyle
1ce5f70dd1 Update GridStd.h 2025-08-11 12:20:54 -04:00
Peter Boyle
473635f401 Update BinaryIO.h 2025-08-11 11:06:06 -04:00
5adf2657dd Updated to compile and run fast on CUDA 2025-08-10 00:00:13 +01:00
82cfff2990 A2A meson field BLAS based momentum project 2025-08-07 15:51:15 +00:00
4397b1c442 Debugged momentum projection for A2A Meson Field 2025-08-07 15:51:01 +00:00
9e6a4a4737 Assertion updates to macros (mostly) with backtrace.
Wilson flow to include options for DBW2, Iwasaki, Symanzik.
View logging for data assurance
2025-08-07 15:48:38 +00:00
41f344bbd3 Merge with Christoph GPT checksum debug 2025-07-15 03:06:09 +00:00
a77cd50b2f Update comms logging in Cshift 2025-07-11 14:36:10 +00:00
73af020f98 improved 2025-06-27 06:08:54 +00:00
bffb83c46e std::cout<<GridLogMessage<<"Debug:"<<std::endl;
    std::cout<<GridLogMessage<<"  --dylib-map     : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
    std::cout<<GridLogMessage<<"  --heartbeat     : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
    std::cout<<GridLogMessage<<"  --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-heartbeat : periodically report backtrace "<<std::endl;

--dylib-map : Grid prints its dylib regions
--heartbeat : itimer based / SIGALRM wake up, which seems to make Aurora more stable
--debug-heartbeat : periodically report to stderr where we are in the code

There is now a libunwind option (configure: --with-unwind=<prefix>) to give an
async-signal-safe backtrace. Avoid the glibc backtrace, due to its mallocs.
2025-06-27 06:08:54 +00:00
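For context on the libunwind option above: glibc backtrace() can call malloc, which is unsafe inside a signal handler. A minimal illustrative sketch of an async-signal-safe stack walk with libunwind (not Grid's code; the function name and buffer sizes are invented here):

    #define UNW_LOCAL_ONLY
    #include <libunwind.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    // Sketch only: libunwind's local unwinding avoids the malloc calls that
    // make glibc backtrace() unsafe inside a signal handler.
    static void print_backtrace_signal_safe(void)
    {
      unw_context_t context;
      unw_cursor_t  cursor;
      unw_getcontext(&context);
      unw_init_local(&cursor, &context);
      while (unw_step(&cursor) > 0) {
        unw_word_t offset, pc;
        char name[256];
        unw_get_reg(&cursor, UNW_REG_IP, &pc);
        if (unw_get_proc_name(&cursor, name, sizeof(name), &offset) != 0)
          strcpy(name, "???");
        char line[512];
        // snprintf is not formally async-signal-safe, but does not allocate
        int n = snprintf(line, sizeof(line), "  0x%lx : %s + 0x%lx\n",
                         (unsigned long)pc, name, (unsigned long)offset);
        if (n > 0) write(STDERR_FILENO, line, n);
      }
    }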
7031f37350 Use libunwind for backtrace as it is async-signal safe 2025-06-27 06:08:54 +00:00
829dd74cb2 Verbose change 2025-06-27 06:08:54 +00:00
66e671985d P2P 2025-06-27 06:08:54 +00:00
5afcbcf0f3 Cshift uses flight recorder 2025-06-27 06:08:54 +00:00
9730579312 Simplify and verbose 2025-06-27 06:08:51 +00:00
bfae14d035 More flight logging 2025-06-27 06:07:34 +00:00
b78fc73d19 Better signal handler 2025-06-27 06:07:34 +00:00
Peter Boyle
709f8ae76c Update README 2025-06-26 23:06:11 -04:00
Peter Boyle
7aa06329d0 Update for new stencil compression options 2025-06-17 18:06:19 +02:00
Peter Boyle
9d6a38c44c Compressed comms options as Sloppy 2025-06-17 16:43:53 +02:00
Peter Boyle
6ec5cee368 Preparing for compressed comms 2025-06-17 16:38:10 +02:00
Peter Boyle
f2e9a68825 Simplify 2025-06-13 17:32:05 +02:00
Peter Boyle
d88750e6b6 Sloppy + non-sloppy 2025-06-13 16:42:01 +02:00
Peter Boyle
821358eda7 Remove partial dirichlet. Favour introducing reduced-precision comms options 2025-06-13 05:08:45 +02:00
Peter Boyle
fce6e1f135 Kill core files for quota reasons 2025-06-13 05:08:15 +02:00
Peter Boyle
8f0bb3e676 remove partial dirichlet 2025-06-13 05:07:56 +02:00
Peter Boyle
262c70d967 Use sloppy comms options 2025-06-13 05:07:23 +02:00
Peter Boyle
da43ef7c2d Remove partial dirichlet option. It's going nowhere 2025-06-13 05:05:15 +02:00
Peter Boyle
7b60ab5df1 Warning suppress 2025-06-13 05:04:55 +02:00
Peter Boyle
f6b961a64e Warning suppress 2025-06-13 05:04:47 +02:00
Peter Boyle
f1ed988aa3 Interface to reduced precision comms 2025-06-13 05:04:12 +02:00
Peter Boyle
eea51bb604 Suppress annoying warns 2025-06-13 05:03:36 +02:00
Peter Boyle
9203126aa5 Scripts 2025-06-11 15:30:16 +02:00
Peter Boyle
f90ba4712a Update for Jupiter 2025-06-11 15:24:34 +02:00
Peter Boyle
3737a24096 Updated python output 2025-06-03 14:09:29 -04:00
d418f78352 Making running on Aurora more debuggable 2025-05-23 20:58:16 +00:00
25163998a0 Makes SYCL compiler happy 2025-05-23 20:57:11 +00:00
Peter Boyle
dc546aaa4b Updated config options for BNL cluster 2025-05-13 18:44:47 -04:00
Peter Boyle
5364d580c9 Output chirality, eigenvector density files and python source lego plot 2025-05-13 18:44:47 -04:00
Peter Boyle
2a9a6347e3 Do not require Grid format RNGs and also do the 5Li reporting 2025-05-13 18:44:47 -04:00
Peter Boyle
cfdb56f314 Run measurements at t=0 too 2025-05-13 18:44:46 -04:00
Peter Boyle
b517e88db3 Update README 2025-05-13 16:49:21 -04:00
bb317aba8d Lattice operator= for SYCL 2025-05-13 12:50:58 +00:00
644cc6647e JSON update 2025-05-13 12:50:58 +00:00
72397ce23b SYCL interface change 2025-05-13 12:50:58 +00:00
Peter Boyle
d60a80c098 Fixes and visualisation 2025-04-29 18:04:23 -04:00
Peter Boyle
bb8b6d9d73 Fix 2025-04-29 18:04:04 -04:00
Peter Boyle
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
Peter Boyle
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
Peter Boyle
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
Peter Boyle
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
Peter Boyle
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
Peter Boyle
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGaugeAction deriv
2025-04-24 14:39:10 -04:00
Peter Boyle
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
Peter Boyle
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
Peter Boyle
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
Peter Boyle
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
Peter Boyle
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
Peter Boyle
233150d93f Bug fix for no accelerator aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
Peter Boyle
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
Chulwoo Jung
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
Chulwoo Jung
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
Christoph Lehner
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
Christoph Lehner
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
Peter Boyle
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
Peter Boyle
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
Peter Boyle
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
Peter Boyle
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
Peter Boyle
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
Peter Boyle
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
Peter Boyle
938c47480f Updated compile on frontier.
Unsatisfactory hacks
2025-04-04 18:35:06 -04:00
Peter Boyle
3811d19298 Fence 2025-04-04 18:35:06 -04:00
Peter Boyle
83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
Peter Boyle
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
Peter Boyle
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
Peter Boyle
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
Peter Boyle
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
Peter Boyle
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
Peter Boyle
9eae8fca5d Size output 2025-04-04 18:35:05 -04:00
Peter Boyle
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
Mashy Green
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
Mashy Green
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
Peter Boyle
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
Christoph Lehner
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
Christoph Lehner
e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
Christoph Lehner
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora now 2025-03-11 13:50:43 +00:00
Mashy Green
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
Mashy Green
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
Mashy Green
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
Christoph Lehner
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
Peter Boyle
3d014864e2 Making LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
Peter Boyle
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
Peter Boyle
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
Peter Boyle
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
Peter Boyle
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
Peter Boyle
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
Peter Boyle
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
Peter Boyle
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
Peter Boyle
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
Peter Boyle
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
Muhammad Asif
b1ba209696 Latest upstream with no-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
Muhammad Asif
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
Mashy Green
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
Mashy Green
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
Mashy Green
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regression test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after reverting too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
Mashy Green
d2dd8f54e2 Fixing after reverting too much! 2025-02-17 17:32:27 +00:00
Mashy Green
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce through Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not PUT has a kinder barrier sequence for NVLINK-type access: when the
GET is done, I can use the data without a barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issuing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This gives somewhat better timing for barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for non-GPU-aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not PUT for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off the critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
Mashy Green
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
b50fb34e71 Perf on Aurora 2025-02-01 18:39:34 +00:00
de84d730ff Fastest run config on Aurora to date 2025-02-01 18:08:40 +00:00
Peter Boyle
c74d11e3d7 PVdagM MG 2025-02-01 11:04:13 -05:00
Christoph Lehner
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
c4fc972fec Merge branch 'feature/deprecate-uvm' into develop 2025-01-31 16:32:36 +00:00
8cf809e231 Best results on Aurora so far 2025-01-31 16:14:45 +00:00
94019a922e Significantly better performance on Aurora without using pipeline mode 2025-01-30 16:36:46 +00:00
Mashy Green
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
Mashy Green
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
d6b2727f86 Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora 2025-01-29 09:22:21 +00:00
74a4f43946 Optional host buffer bounce for non-CUDA-aware MPI 2025-01-28 15:22:46 +00:00
1caf8b0f86 Rename 2025-01-28 15:22:37 +00:00
Chulwoo Jung
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
Chulwoo Jung
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
Peter Boyle
3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
Chulwoo Jung
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
Chulwoo Jung
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
Mashy Green
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
Mashy Green
e99d42404e Removing the regression test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
Mashy Green
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
Mashy Green
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
8fe429346f Dslash testing for reproducibility 2024-11-11 23:11:11 +00:00
Peter Boyle
5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
Peter Boyle
b91fc1b6b4 Merge branch 'feature/boosted' into feature/deprecate-uvm
Fixed boosted free field test
2024-10-28 16:53:09 -04:00
Peter Boyle
eafc150034 Test fft asserts 2024-10-23 16:46:26 -04:00
Peter Boyle
2877f1a268 Verbose reduce 2024-10-23 15:14:16 -04:00
Peter Boyle
1e893af775 GPU happy 2024-10-23 14:52:15 -04:00
Peter Boyle
d9f430a575 Happy GPU 2024-10-23 14:51:16 -04:00
Peter Boyle
63abe87f36 Memory manager verbose improvements that were useful to track an error 2024-10-23 14:49:13 -04:00
Peter Boyle
368d649c8a feature/deprecate-uvm happier -- preallocate device resident neighbour table 2024-10-23 14:47:55 -04:00
Peter Boyle
5603464f39 Fix in partial fraction import/export physical and
make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class
2024-10-23 14:45:58 -04:00
Peter Boyle
655c79f39e Suppress warning on partial override 2024-10-23 14:44:41 -04:00
Peter Boyle
565b231c03 Nvcc happy 2024-10-23 14:44:17 -04:00
Peter Boyle
62a9f180fa NVCC happy 2024-10-23 14:44:04 -04:00
Peter Boyle
5ae77876a8 Meson field and Aslash field on GPU; some compiler warning removed 2024-10-18 19:08:06 -04:00
Peter Boyle
4ed2c2c74f Config command 2024-10-18 13:58:33 -04:00
Peter Boyle
955da582b6 Working on NVCC 2024-10-18 13:58:03 -04:00
Peter Boyle
11b07b950d Vanilla linux compile, assuming spack prerequisites 2024-10-18 13:57:40 -04:00
Peter Boyle
8f70cfeda9 Clean up 2024-10-18 13:56:53 -04:00
Peter Boyle
ce64271048 Remove the copying version 2024-10-18 13:56:24 -04:00
5cc4f3241d Meson field test 2024-10-18 15:42:30 +00:00
Peter Boyle
6815e138b4 Boosted fermion attempt 2024-10-17 18:37:33 +01:00
a78a61d76f Update configure 2024-10-15 14:38:45 +00:00
2eff3f34ed Alternate reduction; default to Grid's own, but add a configure flag
--enable-reduction=grid|mpi
2024-10-15 14:36:06 +00:00
03687c1d62 Final version of test, closer to original again 2024-10-15 14:35:17 +00:00
febfe4e77f Make my own reduction a configure flag 2024-10-15 14:32:35 +00:00
4d1aa134b5 Use normal reduction, configure flag to force deterministic 2024-10-15 14:32:11 +00:00
5ec879860a Odd rounding issue - bears looking into 2024-10-15 14:30:54 +00:00
Peter Boyle
f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
b728af903c Fast axpy norm under CFLAG 2024-10-11 03:23:09 +00:00
54f1999030 axpy_norm_fast -- wasn't using the deterministic MPI sum, causing issues 2024-10-11 03:22:18 +00:00
fd58f0b669 Return ok 2024-10-11 03:21:21 +00:00
c5c67b706e cl::sycl -> SYCL 2024-10-10 22:04:12 +00:00
be7a543e2c Revert barriers -- these were not the problem 2024-10-10 22:03:29 +00:00
68f112d576 New software moves cl::sycl 2024-10-10 22:03:04 +00:00
ec1395a304 Better flight logging 2024-10-10 22:01:57 +00:00
beb0e474ee Use deterministic own brand reduction 2024-10-10 22:01:24 +00:00
2b5fdcbbc5 New software version 2024-10-10 21:59:02 +00:00
295127d456 Deterministic homebrew reduction 2024-10-10 21:58:26 +00:00
7dcfb13694 New software stack 2024-10-10 21:57:35 +00:00
Peter Boyle
ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows a workaround on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
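The point of the column-sum reduction above is a fixed, layout-independent summation order. A hedged sketch of the general idea only (not Grid's implementation; deterministic_sum is an invented name):

    #include <mpi.h>
    #include <vector>

    // Sketch: gather every rank's partial sum, then add in rank order, so the
    // floating-point rounding is identical from run to run, unlike an
    // implementation-defined MPI_Allreduce.
    double deterministic_sum(double local, MPI_Comm comm)
    {
      int size; MPI_Comm_size(comm, &size);
      std::vector<double> parts(size);
      MPI_Allgather(&local, 1, MPI_DOUBLE, parts.data(), 1, MPI_DOUBLE, comm);
      double sum = 0.0;
      for (int r = 0; r < size; r++) sum += parts[r]; // fixed summation order
      return sum;
    }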
Peter Boyle
2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
Peter Boyle
1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
Peter Boyle
3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
Peter Boyle
35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
Peter Boyle
bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
Peter Boyle
f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
9fa8bd6438 Configure for AOT on Aurora latest software 2024-09-23 11:25:44 +00:00
02c8178f16 Almost working on Aurora 2024-09-23 09:43:50 +00:00
e637fbacae Verbose remove 2024-09-23 09:42:43 +00:00
066544281f Deprecate UVM 2024-09-17 13:34:27 +00:00
11be10d2c0 Aurora testing 2024-09-10 18:11:52 +00:00
160969a758 UVM tester, doesn't turn up anything 2024-09-10 18:09:42 +00:00
622f78ebea SYCL updates -- operator = giving trouble on Aurora.
SYCL reduction is failing intermittently with SVM interface - returns
zero, expected non-zero.
Think I need to remove ALL dependence on SVM.
2024-09-04 13:53:48 +00:00
Peter Boyle
aa67a5b095 Rename 2024-08-27 19:54:01 +00:00
Peter Boyle
af9ea0864c Blas fix 2024-08-27 19:53:09 +00:00
Peter Boyle
4e2a6d87c4 Gemm batched fix 2024-08-27 19:24:05 +00:00
Peter Boyle
a465ecece9 Aurora 2024-08-27 19:20:43 +00:00
Peter Boyle
575eb72182 Converges on 16^3 2024-08-27 19:20:38 +00:00
Peter Boyle
3a973914d6 Compile on frontier 2024-08-27 14:55:42 -04:00
Peter Boyle
f568c07bbd Improved the BLAS benchmark 2024-08-27 14:53:54 -04:00
Peter Boyle
2c9878fc3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-27 12:05:46 -04:00
Peter Boyle
27b1b1b005 Checkerboard available for offloading pickCheckerboard 2024-08-27 12:04:09 -04:00
Peter Boyle
130d7ab077 Verbose changes 2024-08-27 12:03:28 -04:00
Peter Boyle
29f6b8a74a Setup 2024-08-27 12:02:49 -04:00
Peter Boyle
9779aaea33 16^3 optimise 2024-08-27 11:38:35 -04:00
Peter Boyle
ec25604a67 Fastest solver for mrhs multigrid 2024-08-27 11:32:34 -04:00
Peter Boyle
3668e81c5e Extract slice working on checkerboard field for Block Lanczos 2024-08-27 11:31:30 -04:00
Peter Boyle
d66b2423cb Move slice operations to GPU for BlockCG 2024-08-27 11:28:47 -04:00
Peter Boyle
15cc78f0b6 peek/poke local site on checkerboard arrays 2024-08-27 11:23:42 -04:00
Peter Boyle
06db4ddea2 Fast init on GPU 2024-08-27 11:22:33 -04:00
Peter Boyle
6cfb90e99f Support needed for accelerator resident set/pick Checkerboard 2024-08-27 11:19:00 -04:00
Peter Boyle
d8be95a2a3 Don't early terminate power method to get more accurate top EV 2024-08-27 11:17:37 -04:00
Peter Boyle
f82702872d Normal residual 2024-08-27 11:16:44 -04:00
Peter Boyle
3752c49ef0 Add option to record the CG polynomial 2024-08-27 11:14:35 -04:00
Peter Boyle
fe65fa4988 MulMatrix 2024-08-27 11:13:18 -04:00
Peter Boyle
1fe4c205a3 Adef 2024-08-27 11:11:47 -04:00
Peter Boyle
d4dc5e0f43 BlockCG linalg acceleration with BLAS 2024-08-27 11:08:33 -04:00
Peter Boyle
77944437ce Functor initialisation 2024-08-27 11:01:02 -04:00
Peter Boyle
c164bff758 MMdag 2024-08-27 11:00:36 -04:00
Peter Boyle
aa2e3d954a MMdag operator 2024-08-27 10:59:29 -04:00
Peter Boyle
de62b04728 Block CG linalg acceleration 2024-08-27 10:58:54 -04:00
Peter Boyle
d0bdb50f24 Analyse power spectrum 2024-08-27 10:58:19 -04:00
Peter Boyle
a8fecbc609 BlockCG linalg via BLAS 2024-08-21 16:08:16 -04:00
Peter Boyle
557fa483ff Blas benchmark committed stand alone 2024-08-20 16:18:43 +00:00
Peter Boyle
fc15d55df6 Mallinfo 2024-08-20 14:33:09 +00:00
Peter Boyle
53573d7d94 Better benchmark 2024-08-20 14:31:57 +00:00
Peter Boyle
bb3c177000 Better benchmarking 2024-08-20 14:31:41 +00:00
Peter Boyle
a3322b470f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-20 14:30:52 +00:00
Peter Boyle
f8f408e7a9 BLAS everywhere 2024-07-25 18:09:02 +00:00
Peter Boyle
baac1127d0 Later intel compiler happiness 2024-07-25 18:06:05 +00:00
Peter Boyle
6f1328160c Remove SVM use 2024-07-25 18:05:40 +00:00
Peter Boyle
04cf902791 Mallinfo and ASAN hooks 2024-07-25 18:04:56 +00:00
Peter Boyle
7a5b1c1a19 Try Catch convenience macro 2024-07-25 18:03:41 +00:00
Peter Boyle
18d2d7da4a Eigen implementation and SYCL implementation 2024-07-25 18:02:56 +00:00
Peter Boyle
b461184797 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-23 09:53:58 -04:00
Peter Boyle
4563b39305 New Frontier config 2024-07-23 09:53:08 -04:00
Peter Boyle
c9d5674d5b Final for paper 2024-07-22 15:26:45 -04:00
Peter Boyle
486412635a 8^4 test for PETSc 2024-07-22 15:25:17 -04:00
Peter Boyle
8b23a1546a Force compile temporarily 2024-07-22 15:24:56 -04:00
Peter Boyle
a901e4e369 Regressed performance for paper 2024-07-22 15:24:04 -04:00
Peter Boyle
804d9367d4 Regressed performance 2024-07-22 15:23:25 -04:00
Peter Boyle
41d8adca95 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-07-11 15:38:45 +00:00
Peter Boyle
059e8e5bb0 New compile option 2024-07-11 15:37:30 +00:00
Peter Boyle
b3ee8ded96 Respect command line 2024-07-11 15:34:48 +00:00
Peter Boyle
cf3584ad15 Convenient to monitor memory across an HMC trajectory 2024-07-11 15:30:32 +00:00
Peter Boyle
a66973163f Device vector not UVM 2024-07-11 15:24:11 +00:00
Peter Boyle
4502a8c8a1 libc malloc heap info dump on Linux 2024-07-11 15:22:18 +00:00
Peter Boyle
9c902e4c2d Batched blas, but not working yet on OneAPI 2024-07-11 15:19:49 +00:00
Peter Boyle
f3eb36adcf Namespace addition 2024-07-11 15:19:19 +00:00
Peter Boyle
7c246606c1 Schur additional case 2024-07-10 22:04:32 +00:00
Peter Boyle
172c75029e Redblack additional case 2024-07-10 22:03:59 +00:00
Peter Boyle
6ae52da571 LLVM leak sanitizer 2024-07-08 15:59:18 +00:00
Peter Boyle
4ee9c68053 Updated compile environment 2024-07-08 15:57:57 +00:00
Peter Boyle
a15b4378a3 Sanitizer preservation of options 2024-07-08 15:57:45 +00:00
Peter Boyle
89fdd7f8dd AOT compilation 2024-07-05 17:47:56 +00:00
Peter Boyle
c328be24b7 Sanitizer compile options 2024-07-05 17:46:43 +00:00
Peter Boyle
a73dc6dbf4 Display linux heap info 2024-06-28 16:05:17 +00:00
Peter Boyle
eee2a2657f Try catch exception wrappers 2024-06-28 16:02:29 +00:00
Peter Boyle
12b8be7cb9 Best so far on 96^3 350 Evecs converged on 4^4 block 2024-06-18 16:31:37 -04:00
Peter Boyle
63c223ea5d Verbose 2024-06-18 03:22:01 +00:00
Peter Boyle
2877fb4a2c More verbose if alloc failure 2024-06-18 03:21:03 +00:00
Peter Boyle
d299c86633 Std::asin,acos 2024-06-11 16:41:23 -04:00
Peter Boyle
6ce52092e8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 15:16:58 -04:00
Peter Boyle
b5926c1d21 Broadcast time info 2024-06-11 15:16:25 -04:00
Peter Boyle
9563238e9b Force initial to identity 2024-06-11 17:51:58 +00:00
Peter Boyle
fb9b1d76ca Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-11 16:48:16 +00:00
Peter Boyle
1739146599 Property to initialise reduction 2024-06-11 16:47:35 +00:00
Peter Boyle
ed20b39ab3 Log files from Frontier benchmark 2024-06-11 11:16:20 -04:00
Peter Boyle
284fc05f15 Protect vs. missing LIME library 2024-06-11 11:08:00 -04:00
Peter Boyle
07a07b6fa3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-06-10 15:09:25 -04:00
Peter Boyle
dc80b08969 96^3 test 2024-06-10 15:07:29 -04:00
Peter Boyle
a49a161f8d SYCL update to use buffer on reduction variable 2024-06-08 16:05:18 +00:00
Peter Boyle
a6479ca50f Shuhei's ComputeWilsonFlow main programme 2024-06-05 15:51:11 -04:00
Peter Boyle
0e607a55e7 Updated for 8^4 test 2024-05-26 20:53:05 +00:00
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
Peter Boyle
c4b9f71357 CPU compile ordering is important 2024-05-21 02:22:32 +01:00
Peter Boyle
394e506aea Compile options for tursa update 2024-05-21 02:10:04 +01:00
Peter Boyle
e19b26341b Tursa configure update 2024-05-21 01:14:27 +01:00
Peter Boyle
cfe1b13225 Back out zero change 2024-05-21 01:14:08 +01:00
Peter Boyle
890c5ea1cd Warning disable 2024-05-20 20:08:31 +01:00
Peter Boyle
a87378d3b6 Update 2024-05-20 20:08:31 +01:00
Peter Boyle
e29b97b3ea Qslash term added 2023-09-14 16:14:03 -04:00
Peter Boyle
ad2b699d2b Better macos 2023-09-14 16:12:21 -04:00
446 changed files with 19184 additions and 7492 deletions

File diff suppressed because it is too large.

View File

@@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -0,0 +1,5 @@
CXX=hipcc
MPICXX=mpicxx
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench

View File

@@ -0,0 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -51,11 +51,13 @@ directory
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
+#pragma nv_diag_suppress declared_but_not_referenced
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
+#pragma diag_suppress declared_but_not_referenced
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon

View File

@@ -1,9 +1,17 @@
 #ifndef GRID_STD_H
 #define GRID_STD_H
+
+///////////////////
+// Grid config
+///////////////////
+#include "Config.h"

 ///////////////////
 // Std C++ dependencies
 ///////////////////
+#define _NBACKTRACE (256)
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+
 #include <cassert>
 #include <complex>
 #include <memory>
@@ -15,7 +23,9 @@
 #include <random>
 #include <functional>
 #include <stdio.h>
+#include <string.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include <strings.h>
 #include <stdio.h>
 #include <signal.h>
@@ -23,11 +33,36 @@
 #include <sys/time.h>
 #include <chrono>
 #include <zlib.h>
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
+void GridAbort(void);
+#define ASSLOG(A) ::write(STDERR_FILENO,A,::strlen(A));
+#ifdef HAVE_EXECINFO_H
+#define GRID_ASSERT(b) if(!(b)) { \
+    fflush(stdout); \
+    ASSLOG(" GRID_ASSERT failure: "); \
+    ASSLOG(__FILE__); \
+    ASSLOG(" : "); \
+    ASSLOG(#b); \
+    ASSLOG(" : "); \
+    int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE); \
+    backtrace_symbols_fd(Grid_backtrace_buffer,symbols,STDERR_FILENO); \
+    GridAbort(); \
+  };
+#else
+#define GRID_ASSERT(b) if(!(b)) { \
+    ASSLOG(" GRID_ASSERT failure: "); \
+    ASSLOG(__FILE__); \
+    ASSLOG(" : "); \
+    ASSLOG(#b); \
+    ASSLOG(" : "); \
+    GridAbort(); \
+  };
+#endif
-///////////////////
-// Grid config
-///////////////////
-#include "Config.h"

 #ifdef TOFU
 #undef GRID_COMMS_THREADS
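Use of the new macro is a drop-in for assert; an illustrative sketch only (check_sizes is a placeholder, assuming a Grid build where GRID_ASSERT is defined):

    #include <Grid/Grid.h>
    #include <vector>
    using namespace Grid;

    // On failure GRID_ASSERT writes the file name and failed expression to
    // stderr and, when execinfo.h is available, a backtrace, then aborts.
    void check_sizes(const std::vector<int> &in, const std::vector<int> &out)
    {
      GRID_ASSERT(in.size() == out.size());
    }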

View File

@@ -68,8 +68,10 @@ if BUILD_FERMION_REPS
endif endif
if BUILD_SP if BUILD_SP
extra_sources+=$(SP_FERMION_FILES) extra_sources+=$(SP_FERMION_FILES)
if BUILD_FERMION_REPS
extra_sources+=$(SP_TWOIND_FERMION_FILES) extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif endif
endif
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a

View File

@@ -29,6 +29,7 @@ directory
 #pragma once

 #include <type_traits>
+#include <exception>
 #include <cassert>

 #define NAMESPACE_BEGIN(A) namespace A {
@@ -36,3 +37,7 @@ directory
 #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
 #define GRID_NAMESPACE_END   NAMESPACE_END(Grid)
 #define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
+#define EXCEPTION_CHECK_BEGIN(A) try {
+#define EXCEPTION_CHECK_END(A) } catch ( std::exception e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }
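A hedged sketch of how the exception-check wrappers are intended to be used (Risky and guarded are placeholders invented here; BACKTRACEFP is defined elsewhere in Grid):

    #include <Grid/Grid.h>
    #include <stdexcept>
    using namespace Grid;

    void Risky(void) { throw std::runtime_error("boom"); } // placeholder

    // Any std::exception escaping the wrapped region is reported with a
    // backtrace, the enclosing function and line, then rethrown.
    void guarded(void)
    {
      EXCEPTION_CHECK_BEGIN(guarded)
      Risky();
      EXCEPTION_CHECK_END(guarded)
    }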

View File

@@ -50,6 +50,9 @@ NAMESPACE_CHECK(approx);
 #include <Grid/algorithms/deflation/Deflation.h>
 #include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
 #include <Grid/algorithms/deflation/MultiRHSDeflation.h>
+#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
+// Not really deflation, but useful
+#include <Grid/algorithms/blas/MomentumProject.h>
 NAMESPACE_CHECK(deflation);
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 NAMESPACE_CHECK(ConjGrad);

View File

@@ -28,6 +28,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_FFT_H_
 #define _GRID_FFT_H_

+#ifdef GRID_CUDA
+#include <cufft.h>
+#endif
+#ifdef GRID_HIP
+#include <hipfft/hipfft.h>
+#endif
+#if !defined(GRID_CUDA) && !defined(GRID_HIP)
 #ifdef HAVE_FFTW
 #if defined(USE_MKL) || defined(GRID_SYCL)
 #include <fftw/fftw3.h>
@@ -35,88 +44,190 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <fftw3.h>
 #endif
 #endif
+#endif

 NAMESPACE_BEGIN(Grid);

-template<class scalar> struct FFTW { };
+#ifndef FFTW_FORWARD
+#define FFTW_FORWARD  (-1)
+#define FFTW_BACKWARD (+1)
+#define FFTW_ESTIMATE (0)
+#endif
+
+template<class scalar> struct FFTW {
+};
+
+#ifdef GRID_HIP
+template<> struct FFTW<ComplexD> {
+public:
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef hipfftDoubleComplex FFTW_scalar;
+  typedef hipfftHandle        FFTW_plan;
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p;
+    auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_Z2Z,howmany);
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+    return p;
+  }
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    hipfftResult rv;
+    if ( sign == forward ) rv = hipfftExecZ2Z(p,in,out,HIPFFT_FORWARD);
+    else                   rv = hipfftExecZ2Z(p,in,out,HIPFFT_BACKWARD);
+    accelerator_barrier();
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    hipfftDestroy(p);
+  }
+};
+template<> struct FFTW<ComplexF> {
+public:
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef hipfftComplex FFTW_scalar;
+  typedef hipfftHandle  FFTW_plan;
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p;
+    auto rv = hipfftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,HIPFFT_C2C,howmany);
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+    return p;
+  }
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    hipfftResult rv;
+    if ( sign == forward ) rv = hipfftExecC2C(p,in,out,HIPFFT_FORWARD);
+    else                   rv = hipfftExecC2C(p,in,out,HIPFFT_BACKWARD);
+    accelerator_barrier();
+    GRID_ASSERT(rv==HIPFFT_SUCCESS);
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    hipfftDestroy(p);
+  }
+};
+#endif
+
+#ifdef GRID_CUDA
+template<> struct FFTW<ComplexD> {
+public:
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef cufftDoubleComplex FFTW_scalar;
+  typedef cufftHandle        FFTW_plan;
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p;
+    cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
+    return p;
+  }
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
+    else                   cufftExecZ2Z(p,in,out,CUFFT_INVERSE);
+    accelerator_barrier();
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    cufftDestroy(p);
+  }
+};
+template<> struct FFTW<ComplexF> {
+public:
+  static const int forward =FFTW_FORWARD;
+  static const int backward=FFTW_BACKWARD;
+  typedef cufftComplex FFTW_scalar;
+  typedef cufftHandle  FFTW_plan;
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
+                                      int istride, int idist,
+                                      FFTW_scalar *out, int *onembed,
+                                      int ostride, int odist,
+                                      int sign, unsigned flags) {
+    FFTW_plan p;
+    cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
+    return p;
+  }
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
+    if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
+    else                   cufftExecC2C(p,in,out,CUFFT_INVERSE);
+    accelerator_barrier();
+  }
+  inline static void fftw_destroy_plan(const FFTW_plan p) {
+    cufftDestroy(p);
+  }
+};
+#endif
+
+#if !defined(GRID_CUDA) && !defined(GRID_HIP)
 #ifdef HAVE_FFTW
 template<> struct FFTW<ComplexD> {
 public:
   typedef fftw_complex FFTW_scalar;
   typedef fftw_plan    FFTW_plan;
-  static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-                                      FFTW_scalar *in, const int *inembed,
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
                                       int istride, int idist,
-                                      FFTW_scalar *out, const int *onembed,
+                                      FFTW_scalar *out, int *onembed,
                                       int ostride, int odist,
                                       int sign, unsigned flags) {
     return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
   }
-  static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
-    ::fftw_flops(p,add,mul,fmas);
-  }
-  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
     ::fftw_execute_dft(p,in,out);
   }
   inline static void fftw_destroy_plan(const FFTW_plan p) {
     ::fftw_destroy_plan(p);
   }
 };
 template<> struct FFTW<ComplexF> {
 public:
   typedef fftwf_complex FFTW_scalar;
   typedef fftwf_plan    FFTW_plan;
-  static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
-                                      FFTW_scalar *in, const int *inembed,
+  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
+                                      FFTW_scalar *in, int *inembed,
                                       int istride, int idist,
-                                      FFTW_scalar *out, const int *onembed,
+                                      FFTW_scalar *out, int *onembed,
                                       int ostride, int odist,
                                       int sign, unsigned flags) {
     return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
   }
-  static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
-    ::fftwf_flops(p,add,mul,fmas);
-  }
-  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
+  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
     ::fftwf_execute_dft(p,in,out);
   }
   inline static void fftw_destroy_plan(const FFTW_plan p) {
     ::fftwf_destroy_plan(p);
   }
 };
 #endif
-#ifndef FFTW_FORWARD
-#define FFTW_FORWARD  (-1)
-#define FFTW_BACKWARD (+1)
 #endif

 class FFT {
 private:

-  GridCartesian *vgrid;
-  GridCartesian *sgrid;
-
-  int Nd;
   double flops;
   double flops_call;
   uint64_t usec;

-  Coordinate dimensions;
-  Coordinate processors;
-  Coordinate processor_coor;
-
 public:
@@ -126,31 +237,25 @@ public:
   double MFlops(void) {return flops/usec;}
   double USec(void)   {return (double)usec;}

-  FFT ( GridCartesian * grid ) :
-    vgrid(grid),
-    Nd(grid->_ndimension),
-    dimensions(grid->_fdimensions),
-    processors(grid->_processors),
-    processor_coor(grid->_processor_coor)
+  FFT ( GridCartesian * grid )
   {
     flops=0;
     usec =0;
-    Coordinate layout(Nd,1);
-    sgrid = new GridCartesian(dimensions,layout,processors,*grid);
   };

   ~FFT ( void)  {
-    delete sgrid;
+    // delete sgrid;
   }

   template<class vobj>
   void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
-    conformable(result.Grid(),vgrid);
-    conformable(source.Grid(),vgrid);
-    Lattice<vobj> tmp(vgrid);
-    tmp = source;
-    for(int d=0;d<Nd;d++){
+    // vgrid=result.Grid();
+    // conformable(result.Grid(),vgrid);
+    // conformable(source.Grid(),vgrid);
+    const int Ndim = source.Grid()->Nd();
+    Lattice<vobj> tmp = source;
+    for(int d=0;d<Ndim;d++){
       if( mask[d] ) {
         FFT_dim(result,tmp,d,sign);
         tmp=result;
@@ -160,59 +265,70 @@ public:

   template<class vobj>
   void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
-    Coordinate mask(Nd,1);
+    const int Ndim = source.Grid()->Nd();
+    Coordinate mask(Ndim,1);
     FFT_dim_mask(result,source,mask,sign);
   }

   template<class vobj>
   void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
-#ifndef HAVE_FFTW
-    assert(0);
-#else
-    conformable(result.Grid(),vgrid);
-    conformable(source.Grid(),vgrid);
+    const int Ndim = source.Grid()->Nd();
+    GridBase *grid = source.Grid();
+    conformable(result.Grid(),source.Grid());

-    int L = vgrid->_ldimensions[dim];
-    int G = vgrid->_fdimensions[dim];
+    int L = grid->_ldimensions[dim];
+    int G = grid->_fdimensions[dim];

-    Coordinate layout(Nd,1);
-    Coordinate pencil_gd(vgrid->_fdimensions);
-    pencil_gd[dim] = G*processors[dim];
-    // Pencil global vol LxLxGxLxL per node
-    GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
+    Coordinate layout(Ndim,1);

     // Construct pencils
     typedef typename vobj::scalar_object sobj;
-    typedef typename sobj::scalar_type scalar;
+    typedef typename vobj::scalar_type scalar;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;

-    Lattice<sobj> pgbuf(&pencil_g);
-    autoView(pgbuf_v , pgbuf, CpuWrite);
+    //std::cout << "CPU view" << std::endl;

     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
     typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;

     int Ncomp = sizeof(sobj)/sizeof(scalar);
-    int Nlow  = 1;
+    int64_t Nlow  = 1;
+    int64_t Nhigh = 1;
     for(int d=0;d<dim;d++){
-      Nlow*=vgrid->_ldimensions[d];
+      Nlow*=grid->_ldimensions[d];
     }
+    for(int d=dim+1;d<Ndim;d++){
+      Nhigh*=grid->_ldimensions[d];
+    }
+    int64_t Nperp=Nlow*Nhigh;
+
+    deviceVector<scalar> pgbuf; // Layout is [perp][component][dim]
+    pgbuf.resize(Nperp*Ncomp*G);
+    scalar *pgbuf_v = &pgbuf[0];

     int rank = 1;  /* 1d transforms */
     int n[] = {G}; /* 1d transforms of length G */
-    int howmany = Ncomp;
+    int howmany = Ncomp * Nperp;
     int odist,idist,istride,ostride;
-    idist   = odist   = 1;          /* Distance between consecutive FT's */
-    istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
+    idist   = odist   = G; /* Distance between consecutive FT's */
+    istride = ostride = 1; /* Distance between two elements in the same FT */
    int *inembed = n, *onembed = n;

     scalar div;
     if ( sign == backward ) div = 1.0/G;
     else if ( sign == forward ) div = 1.0;
-    else assert(0);
+    else GRID_ASSERT(0);
+
+    double t_pencil=0;
+    double t_fft   =0;
+    double t_total =-usecond();
+    // std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;

     FFTW_plan p;
     {
       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -226,68 +342,154 @@ public:
     }

     // Barrel shift and collect global pencil
+    // std::cout << GridLogPerformance<<"Making pencil" << std::endl;
-    Coordinate lcoor(Nd), gcoor(Nd);
+    Coordinate lcoor(Ndim), gcoor(Ndim);
+    double t_copy=0;
+    double t_shift=0;
+    t_pencil = -usecond();
     result = source;
-    int pc = processor_coor[dim];
-    for(int p=0;p<processors[dim];p++) {
-      {
-        autoView(r_v,result,CpuRead);
-        autoView(p_v,pgbuf,CpuWrite);
-        thread_for(idx, sgrid->lSites(),{
-          Coordinate cbuf(Nd);
-          sobj s;
-          sgrid->LocalIndexToLocalCoor(idx,cbuf);
-          peekLocalSite(s,r_v,cbuf);
-          cbuf[dim]+=((pc+p) % processors[dim])*L;
-          pokeLocalSite(s,p_v,cbuf);
-        });
-      }
-      if (p != processors[dim] - 1) {
-        result = Cshift(result,dim,L);
-      }
-    }
+    int pc = grid->_processor_coor[dim];
+
+    const Coordinate ldims = grid->_ldimensions;
+    const Coordinate rdims = grid->_rdimensions;
+    const Coordinate sdims = grid->_simd_layout;
+    Coordinate processors = grid->_processors;
+    Coordinate pgdims(Ndim);
+    pgdims[0] = G;
+    for(int d=0, dd=1;d<Ndim;d++){
+      if ( d!=dim ) pgdims[dd++] = ldims[d];
+    }
+    int64_t pgvol=1;
+    for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
+    const int Nsimd = vobj::Nsimd();
+    for(int p=0;p<processors[dim];p++) {
+      t_copy-=usecond();
+      autoView(r_v,result,AcceleratorRead);
+      accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
+#ifdef GRID_SIMT
+        {
+          int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+        for(int lane=0;lane<Nsimd;lane++) {
+#endif
+          Coordinate icoor;
+          Coordinate ocoor;
+          Coordinate pgcoor;
+          Lexicographic::CoorFromIndex(icoor,lane,sdims);
+          Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+          pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
+          for(int d=0,dd=1;d<Ndim;d++){
+            if ( d!=dim ) {
+              pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+              dd++;
+            }
+          }
+          // Map coordinates in lattice layout to FFTW index
+          int64_t pgidx;
+          Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+          vector_type *from = (vector_type *)&r_v[idx];
+          scalar_type stmp;
+          for(int w=0;w<Ncomp;w++){
+            int64_t pg_idx = pgidx + w*pgvol;
+            stmp = getlane(from[w], lane);
+            pgbuf_v[pg_idx] = stmp;
+          }
+#ifdef GRID_SIMT
+        }
+#else
+        }
+#endif
+      });
+      t_copy+=usecond();
+      if (p != processors[dim] - 1) {
+        Lattice<vobj> temp(grid);
+        t_shift-=usecond();
+        temp = Cshift(result,dim,L); result = temp;
+        t_shift+=usecond();
+      }
+    }
+    t_pencil += usecond();

-    // Loop over orthog coords
-    int NN=pencil_g.lSites();
-    GridStopWatch timer;
-    timer.Start();
-    thread_for( idx,NN,{
-      Coordinate cbuf(Nd);
-      pencil_g.LocalIndexToLocalCoor(idx, cbuf);
-      if ( cbuf[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
-        FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
-        FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
-        FFTW<scalar>::fftw_execute_dft(p,in,out);
-      }
-    });
-    timer.Stop();
+    FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
+    FFTW_scalar *out= (FFTW_scalar *)pgbuf_v;
+    t_fft = -usecond();
+    FFTW<scalar>::fftw_execute_dft(p,in,out,sign);
+    t_fft += usecond();

     // performance counting
-    double add,mul,fma;
-    FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
-    flops_call = add+mul+2.0*fma;
-    usec += timer.useconds();
-    flops+= flops_call*NN;
+    flops_call = 5.0*howmany*G*log2(G);
+    usec  = t_fft;
+    flops = flops_call;

-    // writing out result
+    result = Zero();
+    double t_insert = -usecond();
     {
-      autoView(pgbuf_v,pgbuf,CpuRead);
-      autoView(result_v,result,CpuWrite);
-      thread_for(idx,sgrid->lSites(),{
-        Coordinate clbuf(Nd), cgbuf(Nd);
-        sobj s;
-        sgrid->LocalIndexToLocalCoor(idx,clbuf);
-        cgbuf = clbuf;
-        cgbuf[dim] = clbuf[dim]+L*pc;
-        peekLocalSite(s,pgbuf_v,cgbuf);
-        pokeLocalSite(s,result_v,clbuf);
+      autoView(r_v,result,AcceleratorWrite);
+      accelerator_for(idx,grid->oSites(),Nsimd,{
+#ifdef GRID_SIMT
+        {
+          int lane=acceleratorSIMTlane(Nsimd); // buffer lane
+#else
+        for(int lane=0;lane<Nsimd;lane++) {
+#endif
+          Coordinate icoor(Ndim);
+          Coordinate ocoor(Ndim);
+          Coordinate pgcoor(Ndim);
+          Lexicographic::CoorFromIndex(icoor,lane,sdims);
+          Lexicographic::CoorFromIndex(ocoor,idx,rdims);
+          pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
+          for(int d=0,dd=1;d<Ndim;d++){
+            if ( d!=dim ) {
+              pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
+              dd++;
+            }
+          }
+          // Map coordinates in lattice layout to FFTW index
+          int64_t pgidx;
+          Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
+          vector_type *to = (vector_type *)&r_v[idx];
+          scalar_type stmp;
+          for(int w=0;w<Ncomp;w++){
+            int64_t pg_idx = pgidx + w*pgvol;
+            stmp = pgbuf_v[pg_idx];
+            putlane(to[w], stmp, lane);
+          }
+#ifdef GRID_SIMT
+        }
+#else
+        }
+#endif
       });
     }
     result = result*div;
+    t_insert +=usecond();

     // destroying plan
     FFTW<scalar>::fftw_destroy_plan(p);
-#endif
+    t_total +=usecond();
+    std::cout <<GridLogPerformance<< " FFT took       "<<t_total/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT pencil     "<<t_pencil/1.0e6<<" s" << std::endl;
+    std::cout <<GridLogPerformance<< "  of which copy "<<t_copy/1.0e6  <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< "  of which shift"<<t_shift/1.0e6 <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT kernels    "<<t_fft/1.0e6   <<" s" << std::endl;
+    std::cout <<GridLogPerformance<< " FFT insert     "<<t_insert/1.0e6<<" s" << std::endl;
   }
 };
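For reference, the driver interface of the FFT class is unchanged by the GPU offload; a minimal usage sketch in the style of Grid's FFT tests (the lattice setup below is illustrative, not part of this diff):

    #include <Grid/Grid.h>
    using namespace Grid;

    int main(int argc, char **argv)
    {
      Grid_init(&argc, &argv);

      Coordinate latt_size   = GridDefaultLatt();
      Coordinate simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd());
      Coordinate mpi_layout  = GridDefaultMpi();
      GridCartesian GRID(latt_size, simd_layout, mpi_layout);

      LatticeComplexD C(&GRID), Ctilde(&GRID);
      C = ComplexD(1.0, 0.0);                      // trivial source field

      FFT theFFT(&GRID);
      theFFT.FFT_dim(Ctilde, C, 0, FFT::forward);  // transform dimension 0 only
      theFFT.FFT_all_dim(Ctilde, C, FFT::forward); // transform all dimensions

      Grid_finalize();
      return 0;
    }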

View File

@@ -64,7 +64,7 @@ public:
 //
 // I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
 // while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
-// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
+// with an GRID_ASSERT trap in the non-herm. This isn't right; there must be a better C++ way to
 // do it, but I fear it required multiple inheritance and mixed in abstract base classes
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -103,6 +103,38 @@ public:
     _Mat.MdagM(in,out);
   }
 };
+template<class Matrix,class Field>
+class MMdagLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+public:
+  MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    _Mat.Mdiag(in,out);
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    _Mat.Mdir(in,out,dir,disp);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
+  void Op     (const Field &in, Field &out){
+    _Mat.M(in,out);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    _Mat.Mdag(in,out);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    _Mat.MMdag(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+  void HermOp(const Field &in, Field &out){
+    _Mat.MMdag(in,out);
+  }
+};

 ////////////////////////////////////////////////////////////////////
 // Construct herm op and shift it for mgrid smoother
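The new wrapper drops into the existing solver interfaces; a hedged sketch using placeholder action/field types (not code from this diff):

    #include <Grid/Grid.h>
    using namespace Grid;

    // Sketch: drive a CG solve with M Mdag as the hermitian operator, by
    // analogy with the long-standing MdagMLinearOperator usage. The action
    // type and tolerances here are placeholders.
    void solve_mmdag(WilsonFermionD &D, const LatticeFermionD &src, LatticeFermionD &sol)
    {
      MMdagLinearOperator<WilsonFermionD, LatticeFermionD> HermOp(D);
      ConjugateGradient<LatticeFermionD> CG(1.0e-8, 10000);
      CG(HermOp, src, sol);
    }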
@@ -116,22 +148,22 @@ public:
   // Support for coarsening to a multigrid
   void OpDiag (const Field &in, Field &out) {
     _Mat.Mdiag(in,out);
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
     _Mat.Mdir(in,out,dir,disp);
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
   void Op     (const Field &in, Field &out){
     _Mat.M(in,out);
-    assert(0);
+    GRID_ASSERT(0);
   }
   void AdjOp     (const Field &in, Field &out){
     _Mat.Mdag(in,out);
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
     HermOp(in,out);
@@ -156,13 +188,13 @@ public:
   ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
   // Support for coarsening to a multigrid
   void OpDiag (const Field &in, Field &out) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
   void Op     (const Field &in, Field &out){
     HermOp(in,out);
@@ -239,10 +271,42 @@ public:
     _Mat.Mdag(in,out);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    assert(0);
+    GRID_ASSERT(0);
   }
   void HermOp(const Field &in, Field &out){
-    assert(0);
+    GRID_ASSERT(0);
+  }
+};
+template<class Matrix,class Field>
+class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
+  Matrix &_Mat;
+  RealD shift;
+public:
+  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    _Mat.Mdiag(in,out);
+    out = out + shift*in;
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    _Mat.Mdir(in,out,dir,disp);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    _Mat.MdirAll(in,out);
+  };
+  void Op     (const Field &in, Field &out){
+    _Mat.M(in,out);
+    out = out + shift * in;
+  }
+  void AdjOp     (const Field &in, Field &out){
+    _Mat.Mdag(in,out);
+    out = out + shift * in;
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    GRID_ASSERT(0);
+  }
+  void HermOp(const Field &in, Field &out){
+    GRID_ASSERT(0);
   }
 };
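Likewise for the shifted non-hermitian wrapper, which applies M + shift directly; a small illustrative sketch, again with placeholder types:

    #include <Grid/Grid.h>
    using namespace Grid;

    // Sketch: out = M in + shift * in via the new wrapper; types are placeholders.
    void apply_shifted(WilsonFermionD &D, RealD shift,
                       const LatticeFermionD &in, LatticeFermionD &out)
    {
      ShiftedNonHermitianLinearOperator<WilsonFermionD, LatticeFermionD> Mshift(D, shift);
      Mshift.Op(in, out);
    }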
@@ -281,13 +345,13 @@ class SchurOperatorBase : public LinearOperatorBase<Field> {
   }
   // Support for coarsening to a multigrid
   void OpDiag (const Field &in, Field &out) {
-    assert(0); // must coarsen the unpreconditioned system
+    GRID_ASSERT(0); // must coarsen the unpreconditioned system
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
-    assert(0);
+    GRID_ASSERT(0);
   };
 };
 template<class Matrix,class Field>
@@ -383,10 +447,10 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
  MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
  GRID_ASSERT(0);
}
virtual void HermOp(const Field& in, Field& out) {
  GRID_ASSERT(0);
}
void Op(const Field& in, Field& out) {
  Mpc(in, out);
@@ -396,13 +460,13 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
}
// Support for coarsening to a multigrid
void OpDiag(const Field& in, Field& out) {
  GRID_ASSERT(0); // must coarsen the unpreconditioned system
}
void OpDir(const Field& in, Field& out, int dir, int disp) {
  GRID_ASSERT(0);
}
void OpDirAll(const Field& in, std::vector<Field>& out){
  GRID_ASSERT(0);
};
};
@@ -516,7 +580,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
  GRID_ASSERT( _Mat.isTrivialEE() );
  mass = _Mat.Mass();
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
@@ -547,7 +611,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
  Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out) {
  GRID_ASSERT(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
@@ -559,7 +623,7 @@ template<class Field> class OperatorFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
  GRID_ASSERT(in.size()==out.size());
  for(int k=0;k<in.size();k++){
    (*this)(Linop,in[k],out[k]);
  }
@@ -573,7 +637,7 @@ public:
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{
  GRID_ASSERT(in.size() == out.size());
  for (unsigned int i = 0; i < in.size(); ++i)
  {

View File

@@ -45,6 +45,11 @@ public:
  M(in,tmp);
  Mdag(tmp,out);
}
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;

View File

@@ -59,7 +59,7 @@ public:
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
  delta*=1.02;
  RealD f = approx(x);
  out<< x<<" "<<f<<std::endl;
}
@@ -131,6 +131,26 @@ public:
    Coeffs[j] = s * 2.0/order;
  }
};
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
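For orientation, here is a minimal standalone sketch of the same fit the functor Init above performs (plain C++, no Grid dependencies; the 1/x target, the [0.1,10] window and order 32 are illustrative choices, not from the source). It samples f at the Chebyshev nodes, forms the cosine-transform coefficients exactly as above, and evaluates the series with a Clenshaw recurrence:

#include <cmath>
#include <cstdio>
#include <vector>
int main(void)
{
  double lo=0.1, hi=10.0; int order=32;
  std::vector<double> c(order);
  for(int j=0;j<order;j++){
    double s=0;
    for(int k=0;k<order;k++){
      double y=std::cos(M_PI*(k+0.5)/order);   // Chebyshev node in [-1,1]
      double x=0.5*(y*(hi-lo)+(hi+lo));        // node mapped into [lo,hi]
      s+=std::cos(j*M_PI*(k+0.5)/order)/x;     // f(x)=1/x sampled at the node
    }
    c[j]=s*2.0/order;
  }
  double x=1.0, y=(2.0*x-(hi+lo))/(hi-lo);     // evaluation point, remapped
  double b0=0, b1=0;                           // Clenshaw recurrence
  for(int j=order-1;j>0;j--){ double t=2.0*y*b0-b1+c[j]; b1=b0; b0=t; }
  std::printf("1/x at x=1: exact %g approx %g\n", 1.0/x, y*b0-b1+0.5*c[0]);
  return 0;
}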
void JacksonSmooth(void){
@@ -249,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;

View File

@@ -121,7 +121,7 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
// Reallocate arrays, since degree has changed
if (num_degree != n || den_degree != d) allocate(num_degree,den_degree);
GRID_ASSERT(a_len<=SUM_MAX);
step = new bigfloat[num_degree+den_degree+2];
@@ -151,9 +151,9 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
equations();
if (delta < tolerance) {
  std::cout<<"Delta too small, try increasing precision\n";
  GRID_ASSERT(0);
};
GRID_ASSERT( delta>= tolerance);
search(step);
}

View File

@@ -134,7 +134,7 @@ class AlgRemez
virtual ~AlgRemez();
int getDegree(void){
  GRID_ASSERT(n==d);
  return n;
}
// Reset the bounds of the approximation

View File

@@ -28,11 +28,11 @@ void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyTy
pow_n = num_degree;
pow_d = den_degree;
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) GRID_ASSERT(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) GRID_ASSERT(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) GRID_ASSERT(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) GRID_ASSERT(0);
num_type = num_type_in;
den_type = den_type_in;
@@ -112,9 +112,9 @@ double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degre
equations();
if (delta < tolerance) {
  std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
  GRID_ASSERT(0);
};
GRID_ASSERT( delta>= tolerance );
search();
}
@@ -278,7 +278,7 @@ void AlgRemezGeneral::equations(){
  if(num_pows[j] != -1){ *aa++ = z; t++; }
  z *= x;
}
GRID_ASSERT(t == n+1);
z = (bigfloat)1l;
t = 0;
@@ -286,7 +286,7 @@ void AlgRemezGeneral::equations(){
  if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
  z *= x;
}
GRID_ASSERT(t == d);
B[i] = y * z; // Right hand side vector
}

View File

@@ -106,7 +106,7 @@ class AlgRemezGeneral{
                      bigfloat (*f)(bigfloat x, void *data), void *data);
inline int getDegree(void) const{
  GRID_ASSERT(n==d);
  return n;
}
// Reset the bounds of the approximation

View File

@@ -74,7 +74,7 @@ bigfloat epsilonMobius(bigfloat x, void* data){
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
                         const std::vector<RealD> &omega_in, const int Ls_in,
                         const RealD lambda_bound){
  GRID_ASSERT(omega_in.size() == Ls_in);
  omega_out.resize(Ls_out);
  //Use the Remez algorithm to generate the appropriate rational polynomial

File diff suppressed because it is too large

View File

@@ -0,0 +1,300 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MomentumProject.h
Copyright (C) 2025
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
  MultiMomProject
  Import vectors        -> nxyz x (ncomponent x nt)
  Import complex phases -> nmom x nxyz
  Apply via a (possibly batched) GEMM
*/
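As a concrete picture of that GEMM, here is a standalone toy version (plain C++, illustrative sizes only, separate from the class listing below; Grid's implementation does the same contraction with device buffers and GridBLAS). Each momentum column of the phase matrix M contracts the xyz index of V, leaving one complex number per (component, timeslice, momentum):

#include <cmath>
#include <complex>
#include <cstdio>
#include <vector>
int main(void)
{
  using C=std::complex<double>;
  int nxyz=8, nbtw=4, nmom=2;                  // toy sizes; nbtw = nt*words
  std::vector<C> V(nbtw*nxyz,C(1,0));          // V[(t,w)][xyz], column major in xyz
  std::vector<C> M(nxyz*nmom);                 // M[xyz][mom], plane-wave phases
  for(int m=0;m<nmom;m++) for(int x=0;x<nxyz;x++)
    M[x+m*nxyz]=std::polar(1.0,2*M_PI*m*x/nxyz);
  std::vector<C> P(nbtw*nmom,C(0,0));          // P[(t,w)][mom]
  for(int m=0;m<nmom;m++)                      // the single GEMM: P = V . M
    for(int tw=0;tw<nbtw;tw++)
      for(int x=0;x<nxyz;x++)
        P[tw+m*nbtw]+=V[tw+x*nbtw]*M[x+m*nxyz];
  std::printf("P(tw=0,mom=0) = (%g,%g)\n",P[0].real(),P[0].imag());
  return 0;
}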
template<class Field, class ComplexField>
class MomentumProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
GridBase *grid;
uint64_t nmom;
uint64_t nxyz;
uint64_t nt;
uint64_t nbtw;
uint64_t words;
deviceVector<scalar> BLAS_V; //
deviceVector<scalar> BLAS_M; //
deviceVector<scalar> BLAS_P; //
MomentumProject(){};
~MomentumProject(){ Deallocate(); };
void Deallocate(void)
{
grid=nullptr;
nmom=0;
nxyz=0;
nt=0;
nbtw=0;
words=0;
BLAS_V.resize(0);
BLAS_M.resize(0);
BLAS_P.resize(0);
}
void Allocate(int _nmom,GridBase *_grid)
{
grid=_grid;
Coordinate ldims = grid->LocalDimensions();
nmom=_nmom;
nt = ldims[grid->Nd()-1];
nxyz = grid->lSites()/nt;
words = sizeof(scalar_object)/sizeof(scalar);
nbtw = nt * words;
BLAS_V.resize (nxyz * nt * words );
BLAS_M.resize (nmom * nxyz );
BLAS_P.resize (nmom * nt * words );
}
void ImportMomenta(const std::vector <ComplexField> &momenta)
{
GRID_ASSERT(momenta.size()==nmom);
// might as well just make the momenta here
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_M.size();
GRID_ASSERT(momenta.size()==nmom);
GRID_ASSERT(momenta[0].Grid()==grid);
GRID_ASSERT(sz == nxyz * nmom);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims = grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords = words; // local variable for copy in to GPU
int64_t Nxyz = nxyz;
auto blasData_p = &BLAS_M[0];
for(int m=0;m<momenta.size();m++){
autoView( Data , momenta[m], AcceleratorRead);
auto Data_p = &Data[0];
accelerator_for(xyz,nxyz,1,{
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Lexicographic::CoorFromIndex(lcoor,xyz,ldims);
Coordinate icoor(nd);
Coordinate ocoor(nd);
for (int d = 0; d < nd; d++) {
icoor[d] = lcoor[d]/rdimensions[d];
ocoor[d] = lcoor[d]%rdimensions[d];
}
int64_t osite;
int64_t isite;
Lexicographic::IndexFromCoor(ocoor,osite,rdimensions);
Lexicographic::IndexFromCoor(icoor,isite,simd);
// BLAS_M[nmom][slice_vol]
// Fortran Column major BLAS layout is M_xyz,mom
scalar data = extractLane(isite,Data[osite]);
uint64_t idx = xyz+m*Nxyz;
blasData_p[idx] = data;
});
}
}
void ImportVector(Field &vec)
{
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_V.size();
GRID_ASSERT(sz == nxyz * words * nt);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims= grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords= words; // local variable for copy in to GPU
auto blasData_p = &BLAS_V[0];
autoView( Data , vec, AcceleratorRead);
auto Data_p = &Data[0];
int64_t nwords = words;// for capture
int64_t Nt = nt;// for capture
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Coordinate icoor(nd);
Coordinate ocoor(nd);
Lexicographic::CoorFromIndex(icoor,lane,simd);
Lexicographic::CoorFromIndex(ocoor,sf,rdimensions);
int64_t l_xyz = 0;
for (int d = 0; d < nd; d++) {
lcoor[d] = rdimensions[d]*icoor[d] + ocoor[d];
}
uint64_t l_t = lcoor[nd-1];
Coordinate xyz_coor = lcoor;
xyz_coor[nd-1] =0;
Lexicographic::IndexFromCoor(xyz_coor,l_xyz,ldims);
scalar_object data = extractLane(lane,Data[sf]);
scalar *data_words = (scalar *) &data;
for(int w = 0 ; w < nwords; w++) {
// BLAS_V[slice_vol][nt][words]
// Fortran Column major BLAS layout is V_(t,w)_xyz
uint64_t idx = w+l_t*nwords + l_xyz * nwords * Nt;
blasData_p[idx] = data_words[w];
}
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
void ExportMomentumProjection(std::vector<typename Field::scalar_object> &projection)
{
projection.resize(nmom*nt);
acceleratorCopyFromDevice(&BLAS_P[0],(scalar *)&projection[0],BLAS_P.size()*sizeof(scalar));
// Could decide on a layout later?
}
// Row major layout "C" order:
// BLAS_V[slice_vol][nt][words]
// BLAS_M[nmom][slice_vol]
// BLAS_P[nmom][nt][words]
//
// Fortran Column major BLAS layout is V_(w,t)_xyz
// Fortran Column major BLAS layout is M_xyz,mom
// Fortran Column major BLAS layout is P_(w,t),mom
//
// Projected
//
// P = (V * M)_(w,t),mom
//
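// Worked dimension example (illustrative numbers, not from the source):
// with nxyz=4096 spatial sites, nt=16, words=12 and nmom=8, the gemmBatched
// call below computes P[(words*nt)=192 x nmom=8] = V[192 x 4096] . M[4096 x 8],
// i.e. every momentum is projected on every local timeslice in one product.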
void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata)
{
double t_import=0;
double t_export=0;
double t_gemm =0;
double t_allreduce=0;
t_import-=usecond();
this->ImportVector(data);
std::vector< typename Field::scalar_object > projected_planes;
deviceVector<scalar *> Vd(1);
deviceVector<scalar *> Md(1);
deviceVector<scalar *> Pd(1);
scalar * Vh = & BLAS_V[0];
scalar * Mh = & BLAS_M[0];
scalar * Ph = & BLAS_P[0];
acceleratorPut(Vd[0],Vh);
acceleratorPut(Md[0],Mh);
acceleratorPut(Pd[0],Ph);
t_import+=usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// P_im = VMmx . Vxi
/////////////////////////////////////////
t_gemm-=usecond();
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
words*nt,nmom,nxyz,
scalar(1.0),
Vd,
Md,
scalar(0.0), // wipe out result
Pd);
BLAS.synchronise();
t_gemm+=usecond();
t_export-=usecond();
ExportMomentumProjection(projected_planes); // resizes
t_export+=usecond();
/////////////////////////////////
// Reduce across MPI ranks
/////////////////////////////////
int nd = grid->Nd();
int gt = grid->GlobalDimensions()[nd-1];
int lt = grid->LocalDimensions()[nd-1];
projected_gdata.resize(gt*nmom);
for(int t=0;t<gt*nmom;t++){ // global Nt array with zeroes for stuff not on this node
projected_gdata[t]=Zero();
}
for(int t=0;t<lt;t++){
for(int m=0;m<nmom;m++){
int st = grid->LocalStarts()[nd-1];
projected_gdata[t+st + gt*m] = projected_planes[t+lt*m];
}}
t_allreduce-=usecond();
grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words);
t_allreduce+=usecond();
std::cout << GridLogPerformance<<" MomentumProject t_import "<<t_import<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_export "<<t_export<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_gemm "<<t_gemm<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_reduce "<<t_allreduce<<"us"<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@@ -69,8 +69,8 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
  : evec(_evec), eval(_eval), N(_N)
{
  GRID_ASSERT(evec.size()==eval.size());
  GRID_ASSERT(N <= evec.size());
}
virtual void operator()(const Field &src,Field &guess) {
@@ -141,8 +141,7 @@ public:
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++) {
  std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
  blockPromote(guess_coarse[j],guess[j],subspace);
  guess[j].Checkerboard() = src[j].Checkerboard();

View File

@@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
GRID_ASSERT(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
GRID_ASSERT(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
GRID_ASSERT(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);

View File

@@ -131,12 +131,12 @@ public:
typedef typename Field::vector_object vobj;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
@@ -164,7 +164,7 @@ public:
const int Nsimd = vobj::Nsimd();
// std::cout << "sz "<<sz<<std::endl;
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
GRID_ASSERT(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
uint64_t lwords= words; // local variable for copy in to GPU
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
@@ -198,7 +198,7 @@ public:
     + v*bv
     + sb;
// GRID_ASSERT(site*lwords<sz);
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
@@ -219,12 +219,12 @@ public:
int nvec = vecs.size();
GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
@@ -299,7 +299,7 @@ public:
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
@@ -320,7 +320,7 @@ public:
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
#ifdef GRID_SIMT
@@ -353,7 +353,7 @@ public:
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
@@ -375,7 +375,7 @@ public:
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
// Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
@@ -409,7 +409,7 @@ public:
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
GRID_ASSERT(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -447,10 +447,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
                 nbasis,nrhs,vw,
                 scalar(1.0),
                 Vd,
                 Fd,
                 scalar(0.0), // wipe out C
                 Cd);
BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl;
@@ -464,7 +464,7 @@ public:
{
  int nrhs=fine.size();
  int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
  GRID_ASSERT(nbasis==_nbasis);
  BLAS_F.resize (fine_vol * words * nrhs );
  BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -497,10 +497,10 @@ public:
int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
                 vw,nrhs,nbasis,
                 scalar(1.0),
                 Vd,
                 Cd,
                 scalar(0.0), // wipe out C
                 Fd);
BLAS.synchronise();
// std::cout << " blas call done"<<std::endl;

View File

@@ -98,7 +98,7 @@ public:
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{
  // std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
  GRID_ASSERT(ev<eval.size());
  eval[ev] = _eval;
  int64_t offset = ev*vol*words;
@@ -113,7 +113,7 @@ public:
// Could use to import a batch of eigenvectors
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
{
  GRID_ASSERT(_ev0+_nev<=evec.size());
  Allocate(_nev,evec[0].Grid());
@@ -126,8 +126,8 @@ public:
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
{
  int nrhs = source.size();
  GRID_ASSERT(source.size()==guess.size());
  GRID_ASSERT(grid == guess[0].Grid());
  conformable(guess[0],source[0]);
  int64_t vw = vol * words;
@@ -182,14 +182,14 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
                 nev,nrhs,vw,
                 scalar(1.0),
                 Ed,
                 Rd,
                 scalar(0.0), // wipe out C
                 Cd);
BLAS.synchronise();
GRID_ASSERT(BLAS_C.size()==nev*nrhs);
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
@@ -210,10 +210,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
                 vw,nrhs,nev,
                 scalar(1.0),
                 Ed, // x . nev
                 Cd, // nev . nrhs
                 scalar(0.0),
                 Gd);
BLAS.synchronise();

View File

@@ -270,7 +270,7 @@ class TwoLevelCG : public LinearFunction<Field>
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
  src_nrm[rhs]=norm2(src[rhs]);
  GRID_ASSERT(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);

View File

@@ -53,6 +53,7 @@ class TwoLevelCGmrhs
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field>     &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer;
@@ -62,7 +63,12 @@ class TwoLevelCGmrhs
GridStopWatch SmoothTimer;
GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most operator functions
TwoLevelCGmrhs(RealD tol,
               Integer maxit,
@@ -73,12 +79,313 @@ class TwoLevelCGmrhs
  MaxIterations(maxit),
  _FineLinop(FineLinop),
  _Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{
  grid = fine;
};
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_zz,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
std::vector<Field> & MQ,
const std::vector<Field> & Z,
const std::vector<Field> & MZ)
{
RealD t0=usecond();
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
RealD t1=usecond();
m_zz = 0.5*(m_zz+m_zz.adjoint());
Eigen::MatrixXcd L = m_zz.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
RealD t3=usecond();
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
RealD t4=usecond();
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
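A standalone Eigen check of the identity the Cholesky-based thin QR above relies on (a sketch separate from the class, requiring only the Eigen headers; the 100 x 4 random Z stands in for the fermion block): with C = L^dag from Z^dag Z = L L^dag, the rotated block Q = Z C^{-1} has orthonormal columns.

#include <Eigen/Dense>
#include <iostream>
int main(void)
{
  const int vol=100, nrhs=4;                 // toy "fermion" and block sizes
  Eigen::MatrixXcd Z  = Eigen::MatrixXcd::Random(vol,nrhs);
  Eigen::MatrixXcd zz = Z.adjoint()*Z;       // the InnerProductMatrix step
  zz = 0.5*(zz+zz.adjoint());                // force manifest hermiticity
  Eigen::MatrixXcd L  = zz.llt().matrixL();  // zz = L L^dag
  Eigen::MatrixXcd C  = L.adjoint();
  Eigen::MatrixXcd Q  = Z * C.inverse();
  std::cout << "|Q^dag Q - 1| = "            // ~1e-14: the QdagQ = 1 check passes
            << (Q.adjoint()*Q - Eigen::MatrixXcd::Identity(nrhs,nrhs)).norm()
            << std::endl;
  return 0;
}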
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); GRID_ASSERT(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
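/* Reading guide (a sketch of the correspondence, not from the source): in the
   code below Z first holds the residual R = src - A X0 and, inside the loop,
   A D; MZ holds the preconditioned M1 Z; Q, MQ and D are the rotated and
   search-direction blocks; the small nrhs x nrhs Eigen matrices m_C, m_S and
   m_M realise C, S and M = [D^dag Z]^{-1} from the pseudocode above. */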
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
GRID_ASSERT(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{
  std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
  src[0].Grid()->Barrier();
@@ -108,7 +415,7 @@ class TwoLevelCGmrhs
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
  src_nrm[rhs]=norm2(src[rhs]);
  GRID_ASSERT(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
@@ -361,15 +668,26 @@ public:
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
// this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
this->_Smoother(in,Min);
this->SmoothTimer.Stop();
#else
for(int rhs=0;rhs<nrhs;rhs++) {
  this->SmoothTimer.Start();
  this->_Smoother(in[rhs],Min[rhs]);
  this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
  this->FineTimer.Start();
  this->_FineLinop.HermOp(Min[rhs],out[rhs]);
  axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
  this->FineTimer.Stop();
@@ -401,9 +719,11 @@ public:
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop();
this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) {
  axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
}
// this->zzz=out[0];
this->FineTimer.Stop();
}
};

View File

@@ -47,7 +47,7 @@ class BiCGSTAB : public OperatorFunction<Field>
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw a GRID_ASSERT when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
@@ -77,7 +77,7 @@ class BiCGSTAB : public OperatorFunction<Field>
// Initial residual computation & set up
RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0);
Linop.Op(psi, v);
b = norm2(v);
@@ -214,7 +214,7 @@ class BiCGSTAB : public OperatorFunction<Field>
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
if(ErrorOnNoConverge){ GRID_ASSERT(true_residual / Tolerance < 10000.0); }
IterationsToComplete = k;
@@ -224,7 +224,7 @@ class BiCGSTAB : public OperatorFunction<Field>
std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
if(ErrorOnNoConverge){ GRID_ASSERT(0); }
IterationsToComplete = k;
}
};

View File

@@ -31,6 +31,58 @@ directory
NAMESPACE_BEGIN(Grid);
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
typedef typename Field::scalar_type scomplex;
int Nblock = X.size();
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
//
//Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
template<class Field>
double normv(const std::vector<Field> &P){
int Nblock = P.size();
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
//////////////////////////////////////////////////////////////////////////
@@ -46,7 +98,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
int Nblock;
BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw a GRID_ASSERT when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
@@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
                const std::vector<Field> & R)
{
  InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
@@ -131,7 +201,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
} else if (CGtype == CGmultiRHS ) { } else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi); CGmultiRHSsolve(Linop,Src,Psi);
} else { } else {
assert(0); GRID_ASSERT(0);
} }
} }
virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi)
@@ -139,7 +209,7 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
if ( CGtype == BlockCGrQVec ) { if ( CGtype == BlockCGrQVec ) {
BlockCGrQsolveVec(Linop,Src,Psi); BlockCGrQsolveVec(Linop,Src,Psi);
} else { } else {
assert(0); GRID_ASSERT(0);
} }
} }
@@ -186,12 +256,13 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog); sliceNorm(ssq,B,Orthog);
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog); sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog); sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
/************************************************************************ /************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001) * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD); Linop.HermOp(X, AD);
tmp = B - AD; tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q; D=Q;
@@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
SolverTimer.Start(); SolverTimer.Start();
RealD max_resid=0;
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations; k++) {
@@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/ */
m_rr = m_C.adjoint() * m_C; m_rr = m_C.adjoint() * m_C;
RealD max_resid=0; max_resid=0;
RealD rrsum=0; RealD rrsum=0;
RealD rr; RealD rr;
@@ -322,9 +398,11 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
} }
} }
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0); std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -360,10 +438,10 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog); sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog); sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
// Initial search dir is guess // Initial search dir is guess
Linop.HermOp(Psi, AP); Linop.HermOp(Psi, AP);
@@ -462,47 +540,10 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
} }
std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl; std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation: // BlockCGrQvec implementation:
//-------------------------- //--------------------------
@@ -513,7 +554,7 @@ double normv(const std::vector<Field> &P){
void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X)
{ {
Nblock = B.size(); Nblock = B.size();
assert(Nblock == X.size()); GRID_ASSERT(Nblock == X.size());
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
@@ -549,13 +590,14 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);} for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
/************************************************************************ /************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001) * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) { for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]); Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b]; tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
} }
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
@@ -688,7 +731,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
} }
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl; std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }

View File

@@ -36,7 +36,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
 public:
 using OperatorFunction<Field>::operator();
-bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
+bool ErrorOnNoConverge; // Throw an GRID_ASSERT when CAGMRES fails to converge,
 // defaults to true
 RealD Tolerance;
@@ -82,7 +82,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
 conformable(psi, src);
 RealD guess = norm2(psi);
-assert(std::isnan(guess) == 0);
+GRID_ASSERT(std::isnan(guess) == 0);
 RealD cp;
 RealD ssq = norm2(src);
@@ -137,7 +137,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
 std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
 if (ErrorOnNoConverge)
-assert(0);
+GRID_ASSERT(0);
 }
 RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -185,7 +185,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
 }
 }
-assert(0); // Never reached
+GRID_ASSERT(0); // Never reached
 return cp;
 }

View File

@@ -38,13 +38,14 @@ NAMESPACE_BEGIN(Grid);
 // single input vec, single output vec.
 /////////////////////////////////////////////////////////////
 template <class Field>
 class ConjugateGradient : public OperatorFunction<Field> {
 public:
 using OperatorFunction<Field>::operator();
-bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+bool ErrorOnNoConverge;  // throw an GRID_ASSERT when the CG fails to converge.
 // Defaults true.
 RealD Tolerance;
 Integer MaxIterations;
@@ -57,10 +58,22 @@ public:
 ErrorOnNoConverge(err_on_no_conv)
 {};
+virtual void LogIteration(int k,RealD a,RealD b){
+//  std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
+};
+virtual void LogBegin(void){
+  std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
+};
 void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
+this->LogBegin();
 GRID_TRACE("ConjugateGradient");
 GridStopWatch PreambleTimer;
+GridStopWatch ConstructTimer;
+GridStopWatch NormTimer;
+GridStopWatch AssignTimer;
 PreambleTimer.Start();
 psi.Checkerboard() = src.Checkerboard();
@@ -70,14 +83,19 @@ public:
 //RealD b_pred;
 // Was doing copies
+ConstructTimer.Start();
 Field p (src.Grid());
 Field mmp(src.Grid());
 Field r (src.Grid());
+ConstructTimer.Stop();
 // Initial residual computation & set up
+NormTimer.Start();
 ssq = norm2(src);
 RealD guess = norm2(psi);
-assert(std::isnan(guess) == 0);
+NormTimer.Stop();
+GRID_ASSERT(std::isnan(guess) == 0);
+AssignTimer.Start();
 if ( guess == 0.0 ) {
 r = src;
 p = r;
@@ -89,6 +107,7 @@ public:
 a = norm2(p);
 }
 cp = a;
+AssignTimer.Stop();
 // Handle trivial case of zero src
 if (ssq == 0.){
@@ -164,6 +183,7 @@ public:
 }
 LinearCombTimer.Stop();
 LinalgTimer.Stop();
+LogIteration(k,a,b);
 IterationTimer.Stop();
 if ( (k % 500) == 0 ) {
@@ -202,7 +222,7 @@ public:
 std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
-if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
 IterationsToComplete = k;
 TrueResidual = true_residual;
@@ -220,6 +240,9 @@ public:
 <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
 SolverTimer.Stop();
 std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
+std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
+std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
+std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
 std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
 std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
 std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
@@ -228,10 +251,123 @@ public:
 std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
 std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
-if (ErrorOnNoConverge) assert(0);
+if (ErrorOnNoConverge) GRID_ASSERT(0);
 IterationsToComplete = k;
 }
 };
+
+template <class Field>
+class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
+public:
+  // Optionally record the CG polynomial
+  std::vector<double> ak;
+  std::vector<double> bk;
+  std::vector<double> poly_p;
+  std::vector<double> poly_r;
+  std::vector<double> poly_Ap;
+  std::vector<double> polynomial;
+
+public:
+  ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
+    : ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
+  { };
+  void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
+  {
+    Field tmp(src.Grid());
+    Field AtoN(src.Grid());
+    AtoN = src;
+    psi=AtoN*polynomial[0];
+    for(int n=1;n<polynomial.size();n++){
+      tmp = AtoN;
+      Linop.HermOp(tmp,AtoN);
+      psi = psi + polynomial[n]*AtoN;
+    }
+  }
+  void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
+  {
+    Field Ap(src.Grid());
+    Field r(src.Grid());
+    Field p(src.Grid());
+    p=src;
+    r=src;
+    x=Zero();
+    x.Checkerboard()=src.Checkerboard();
+    for(int k=0;k<ak.size();k++){
+      x = x + ak[k]*p;
+      Linop.HermOp(p,Ap);
+      r = r - ak[k] * Ap;
+      p = r + bk[k] * p;
+    }
+  }
+  void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
+  {
+    psi=Zero();
+    this->operator ()(Linop,src,psi);
+  }
+  virtual void LogBegin(void)
+  {
+    std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
+    ak.resize(0);
+    bk.resize(0);
+    polynomial.resize(0);
+    poly_Ap.resize(0);
+    poly_Ap.resize(0);
+    poly_p.resize(1);
+    poly_r.resize(1);
+    poly_p[0]=1.0;
+    poly_r[0]=1.0;
+  };
+  virtual void LogIteration(int k,RealD a,RealD b)
+  {
+    // With zero guess,
+    // p = r = src
+    //
+    // iterate:
+    //   x = x + a p
+    //   r = r - a A p
+    //   p = r + b p
+    //
+    // [0]
+    // r = x
+    // p = x
+    // Ap=0
+    //
+    // [1]
+    // Ap = A x + 0 ==> shift poly P right by 1 and add 0.
+    // x  = x + a p   ==> add polynomials term by term
+    // r  = r - a A p ==> add polynomials term by term
+    // p  = r + b p   ==> add polynomials term by term
+    //
+    std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
+    ak.push_back(a);
+    bk.push_back(b);
+    // Ap= right_shift(p)
+    poly_Ap.resize(k+1);
+    poly_Ap[0]=0.0;
+    for(int i=0;i<k;i++){
+      poly_Ap[i+1]=poly_p[i];
+    }
+    // x = x + a p
+    polynomial.resize(k);
+    polynomial[k-1]=0.0;
+    for(int i=0;i<k;i++){
+      polynomial[i] = polynomial[i] + a * poly_p[i];
+    }
+    // r = r - a Ap
+    // p = r + b p
+    poly_r.resize(k+1);
+    poly_p.resize(k+1);
+    poly_r[k] = poly_p[k] = 0.0;
+    for(int i=0;i<k+1;i++){
+      poly_r[i] = poly_r[i] - a * poly_Ap[i];
+      poly_p[i] = poly_r[i] + b * poly_p[i];
+    }
+  }
+};
 NAMESPACE_END(Grid);
 #endif
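The idea behind the recorded (ak, bk) sequence is that a CG run on a fixed operator defines a fixed polynomial recursion, which can later be replayed with no inner products at all. A standalone toy (plain C++, small dense SPD matrix, all names illustrative, not Grid code) that runs CG once, records the coefficients as LogIteration would, then replays the fixed sequence exactly as CGsequenceHermOp does:

```cpp
// Record CG coefficients on a 3x3 SPD system, then replay the fixed
// (a_k, b_k) recursion and check it reproduces the same solution.
#include <cstdio>
#include <vector>
#include <cmath>

using Vec = std::vector<double>;
static const double A[3][3] = {{4,1,0},{1,3,1},{0,1,2}}; // SPD test matrix

Vec matmul(const Vec &x){ Vec y(3,0.0);
  for(int i=0;i<3;i++) for(int j=0;j<3;j++) y[i]+=A[i][j]*x[j]; return y; }
double dot(const Vec &x,const Vec &y){ double s=0; for(int i=0;i<3;i++) s+=x[i]*y[i]; return s; }

int main(void){
  Vec b{1,2,3}, x(3,0.0), r=b, p=r;
  std::vector<double> ak, bk;
  double rr = dot(r,r);
  for(int k=0;k<3 && rr>1e-28;k++){            // CG, recording coefficients
    Vec Ap = matmul(p);
    double a = rr/dot(p,Ap); ak.push_back(a);
    for(int i=0;i<3;i++){ x[i]+=a*p[i]; r[i]-=a*Ap[i]; }
    double rrnew = dot(r,r);
    double beta = rrnew/rr; bk.push_back(beta); rr=rrnew;
    for(int i=0;i<3;i++) p[i]=r[i]+beta*p[i];
  }
  // Replay: same update order as CGsequenceHermOp (x before r and p)
  Vec x2(3,0.0), r2=b, p2=b;
  for(size_t k=0;k<ak.size();k++){
    for(int i=0;i<3;i++) x2[i]+=ak[k]*p2[i];
    Vec Ap = matmul(p2);
    for(int i=0;i<3;i++) r2[i]-=ak[k]*Ap[i];
    for(int i=0;i<3;i++) p2[i]=r2[i]+bk[k]*p2[i];
  }
  double dev=0; for(int i=0;i<3;i++) dev=std::max(dev,std::fabs(x[i]-x2[i]));
  std::printf("replay deviation %.3e\n", dev);
  return 0;
}
```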

View File

@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
 //Compute double precision rsd and also new RHS vector.
 Linop_d.HermOp(sol_d, tmp_d);
 RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
+std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
 std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
 if(norm < OuterLoopNormMult * stop){
 std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 break;
 }
-while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 PrecChangeTimer.Start();
 precisionChange(src_f, src_d, pc_wk_dp_to_sp);
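The doubling loop drives inner_tol up to roughly sqrt(stop/norm), loosening the single-precision inner solve as the outer residual shrinks; the new *1.01 slack keeps it from taking one extra doubling when norm*inner_tol^2 lands exactly on stop. A standalone toy illustration (values invented, not Grid code):

```cpp
// Standalone illustration of the mixed-precision inner tolerance update.
#include <cstdio>
#include <cmath>

int main(void){
  double stop = 1.0e-16;      // outer target (residual^2 scale)
  double norm = 1.0e-8;       // current outer residual^2
  double inner_tol = 1.0e-8;  // inner single-precision tolerance
  while (norm * inner_tol * inner_tol < stop * 1.01) inner_tol *= 2;
  std::printf("inner_tol = %g  (sqrt(stop/norm) = %g)\n",
              inner_tol, std::sqrt(stop/norm));
  return 0;
}
```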

View File

@@ -77,7 +77,7 @@ public:
 }
 void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
-assert(src_d_in.size() == sol_d.size());
+GRID_ASSERT(src_d_in.size() == sol_d.size());
 int NBatch = src_d_in.size();
 std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;

View File

@@ -98,15 +98,15 @@ public:
 std::vector<RealD> alpha(nshift,1.0);
 std::vector<Field>   ps(nshift,grid);// Search directions
-assert(psi.size()==nshift);
-assert(mass.size()==nshift);
-assert(mresidual.size()==nshift);
+GRID_ASSERT(psi.size()==nshift);
+GRID_ASSERT(mass.size()==nshift);
+GRID_ASSERT(mresidual.size()==nshift);
-// dynamic sized arrays on stack; 2d is a pain with vector
-RealD  bs[nshift];
-RealD  rsq[nshift];
-RealD  z[nshift][2];
-int     converged[nshift];
+// remove dynamic sized arrays on stack; 2d is a pain with vector
+std::vector<RealD>  bs(nshift);
+std::vector<RealD>  rsq(nshift);
+std::vector<std::array<RealD,2> > z(nshift);
+std::vector<int>     converged(nshift);
 const int primary =0;
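Context for this change: `RealD bs[nshift]` with a runtime `nshift` is a C-style variable-length array, a GCC/Clang extension rather than standard C++, and the 2-d form `z[nshift][2]` is the awkward case the old comment alludes to. A minimal standalone sketch of the replacement pattern (illustrative values only):

```cpp
// Replacing runtime-length stack arrays (a non-standard C++ extension)
// with std::vector, covering the 2-d case via std::array.
#include <array>
#include <vector>
#include <cstdio>

int main(void){
  int nshift = 4;                               // runtime value
  // double z[nshift][2];                       // VLA: not standard C++
  std::vector<std::array<double,2> > z(nshift); // same indexing, heap backed
  std::vector<int> converged(nshift, 0);
  for (int s = 0; s < nshift; s++) { z[s][0] = 1.0; z[s][1] = 0.0; }
  std::printf("z[%d][0] = %f, converged[0] = %d\n",
              nshift-1, z[nshift-1][0], converged[0]);
  return 0;
}
```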
@@ -122,7 +122,7 @@ public:
 // Check lightest mass
 for(int s=0;s<nshift;s++){
-assert( mass[s]>= mass[primary] );
+GRID_ASSERT( mass[s]>= mass[primary] );
 converged[s]=0;
 }
@@ -338,7 +338,7 @@ public:
 }
 // ugly hack
 std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-//  assert(0);
+//  GRID_ASSERT(0);
 }
 };

View File

@@ -118,16 +118,16 @@ public:
 FieldF r_f(SinglePrecGrid);
 FieldD mmp_d(DoublePrecGrid);
-assert(psi_d.size()==nshift);
-assert(mass.size()==nshift);
-assert(mresidual.size()==nshift);
+GRID_ASSERT(psi_d.size()==nshift);
+GRID_ASSERT(mass.size()==nshift);
+GRID_ASSERT(mresidual.size()==nshift);
 // dynamic sized arrays on stack; 2d is a pain with vector
-RealD  bs[nshift];
-RealD  rsq[nshift];
-RealD  rsqf[nshift];
-RealD  z[nshift][2];
-int     converged[nshift];
+std::vector<RealD>  bs(nshift);
+std::vector<RealD>  rsq(nshift);
+std::vector<RealD>  rsqf(nshift);
+std::vector<std::array<RealD,2> > z(nshift);
+std::vector<int>     converged(nshift);
 const int primary =0;
@@ -141,7 +141,7 @@ public:
 // Check lightest mass
 for(int s=0;s<nshift;s++){
-assert( mass[s]>= mass[primary] );
+GRID_ASSERT( mass[s]>= mass[primary] );
 converged[s]=0;
 }
@@ -179,7 +179,7 @@ public:
 Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
 tmp_d = tmp_d - mmp_d;
 std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-//  assert(norm2(tmp_d)< 1.0e-4);
+//  GRID_ASSERT(norm2(tmp_d)< 1.0e-4);
 axpy(mmp_d,mass[0],p_d,mmp_d);
 RealD rn = norm2(p_d);
@@ -365,7 +365,7 @@ public:
 }
 std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-assert(0);
+GRID_ASSERT(0);
 }
 };

View File

@@ -48,12 +48,12 @@ public:
 ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
-void OpDiag (const Field &in, Field &out){ assert(0); }
-void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
-void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
-void Op     (const Field &in, Field &out){ assert(0); }
-void AdjOp  (const Field &in, Field &out){ assert(0); }
+void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
+void OpDir  (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
+void OpDirAll  (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); }
+void Op     (const Field &in, Field &out){ GRID_ASSERT(0); }
+void AdjOp  (const Field &in, Field &out){ GRID_ASSERT(0); }
 void HermOp(const Field &in, Field &out){
 linop_base.HermOp(in, out);
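The wrapper deliberately stubs out every interface method except HermOp (the shift itself is presumably applied in the lines not shown in this hunk), since the multi-shift machinery only ever needs the action of (M†M + σ). A standalone dense sketch of the same idea, with all names invented:

```cpp
// Standalone sketch (Eigen, names invented): a "shifted" Hermitian operator
// built from a base operator, out = A in + shift * in, as used when solving
// (A + sigma) x = b for several sigma with one base operator.
#include <Eigen/Dense>
#include <iostream>

struct ShiftedOp {
  const Eigen::MatrixXd &A;  // base Hermitian operator
  double shift;
  void HermOp(const Eigen::VectorXd &in, Eigen::VectorXd &out) const {
    out = A * in;            // base HermOp
    out += shift * in;       // apply the shift
  }
};

int main(void){
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(4,4);
  Eigen::MatrixXd A = B.transpose()*B;                 // SPD base operator
  Eigen::VectorXd v = Eigen::VectorXd::Random(4), w(4);
  ShiftedOp S{A, 0.25};
  S.HermOp(v, w);
  std::cout << "||(A+0.25 I)v - w|| = "
            << (A*v + 0.25*v - w).norm() << std::endl;
  return 0;
}
```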
@@ -151,16 +151,16 @@ public:
 FieldD r_d(DoublePrecGrid);
 FieldD mmp_d(DoublePrecGrid);
-assert(psi_d.size()==nshift);
-assert(mass.size()==nshift);
-assert(mresidual.size()==nshift);
+GRID_ASSERT(psi_d.size()==nshift);
+GRID_ASSERT(mass.size()==nshift);
+GRID_ASSERT(mresidual.size()==nshift);
 // dynamic sized arrays on stack; 2d is a pain with vector
-RealD  bs[nshift];
-RealD  rsq[nshift];
-RealD  rsqf[nshift];
-RealD  z[nshift][2];
-int     converged[nshift];
+std::vector<RealD>  bs(nshift);
+std::vector<RealD>  rsq(nshift);
+std::vector<RealD>  rsqf(nshift);
+std::vector<std::array<RealD,2> > z(nshift);
+std::vector<int>     converged(nshift);
 const int primary =0;
@@ -174,7 +174,7 @@ public:
 // Check lightest mass
 for(int s=0;s<nshift;s++){
-assert( mass[s]>= mass[primary] );
+GRID_ASSERT( mass[s]>= mass[primary] );
 converged[s]=0;
 }
@@ -211,7 +211,7 @@ public:
 Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
 tmp_d = tmp_d - mmp_d;
 std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-assert(norm2(tmp_d)< 1.0);
+GRID_ASSERT(norm2(tmp_d)< 1.0);
 axpy(mmp_d,mass[0],p_d,mmp_d);
 RealD rn = norm2(p_d);
@@ -408,7 +408,7 @@ public:
 }
 std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-assert(0);
+GRID_ASSERT(0);
 }
 };

View File

@@ -35,7 +35,7 @@ template<class FieldD,class FieldF,
 typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
 class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
 public:
-bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
+bool ErrorOnNoConverge;  // throw an GRID_ASSERT when the CG fails to converge.
 // Defaults true.
 RealD Tolerance;
 Integer MaxIterations;
@@ -66,7 +66,7 @@ public:
 DoFinalCleanup(true),
 Linop_fallback(NULL)
 {
-assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
+GRID_ASSERT(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
 };
 void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
@@ -90,7 +90,7 @@ public:
 // Initial residual computation & set up
 RealD guess = norm2(psi);
-assert(std::isnan(guess) == 0);
+GRID_ASSERT(std::isnan(guess) == 0);
 Linop_d.HermOpAndNorm(psi, mmp, d, b);
@@ -217,7 +217,7 @@ public:
 CG(Linop_d,src,psi);
 IterationsToCleanup = CG.IterationsToComplete;
 }
-else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
+else if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
 std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
 return;
@@ -263,7 +263,7 @@ public:
 std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
 << std::endl;
-if (ErrorOnNoConverge) assert(0);
+if (ErrorOnNoConverge) GRID_ASSERT(0);
 IterationsToComplete = k;
 ReliableUpdatesPerformed = l;
 }

View File

@@ -106,7 +106,7 @@ public:
 }
 std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
-assert(0);
+GRID_ASSERT(0);
 }
 };
 NAMESPACE_END(Grid);

View File

@@ -36,7 +36,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
 public:
 using OperatorFunction<Field>::operator();
-bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
+bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FCAGMRES fails to converge,
 // defaults to true
 RealD Tolerance;
@@ -87,7 +87,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
 conformable(psi, src);
 RealD guess = norm2(psi);
-assert(std::isnan(guess) == 0);
+GRID_ASSERT(std::isnan(guess) == 0);
 RealD cp;
 RealD ssq = norm2(src);
@@ -144,7 +144,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
 std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
 if (ErrorOnNoConverge)
-assert(0);
+GRID_ASSERT(0);
 }
 RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -191,7 +191,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
 }
 }
-assert(0); // Never reached
+GRID_ASSERT(0); // Never reached
 return cp;
 }

View File

@@ -36,7 +36,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
 using OperatorFunction<Field>::operator();
-bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
+bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FGMRES fails to converge,
 // defaults to true
 RealD Tolerance;
@@ -85,7 +85,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
 conformable(psi, src);
 RealD guess = norm2(psi);
-assert(std::isnan(guess) == 0);
+GRID_ASSERT(std::isnan(guess) == 0);
 RealD cp;
 RealD ssq = norm2(src);
@@ -142,7 +142,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
 std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
 if (ErrorOnNoConverge)
-assert(0);
+GRID_ASSERT(0);
 }
 RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -189,7 +189,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
 }
 }
-assert(0); // Never reached
+GRID_ASSERT(0); // Never reached
 return cp;
 }

View File

@@ -36,7 +36,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
 using OperatorFunction<Field>::operator();
-bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
+bool ErrorOnNoConverge; // Throw an GRID_ASSERT when GMRES fails to converge,
 // defaults to true
 RealD Tolerance;
@@ -80,7 +80,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
 conformable(psi, src);
 RealD guess = norm2(psi);
-assert(std::isnan(guess) == 0);
+GRID_ASSERT(std::isnan(guess) == 0);
 RealD cp;
 RealD ssq = norm2(src);
@@ -135,7 +135,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
 std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
 if (ErrorOnNoConverge)
-assert(0);
+GRID_ASSERT(0);
 }
 RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -181,7 +181,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
 }
 }
-assert(0); // Never reached
+GRID_ASSERT(0); // Never reached
 return cp;
 }

View File

@@ -175,7 +175,7 @@ public:
 eresid(_eresid),  MaxIter(_MaxIter),
 diagonalisation(_diagonalisation),split_test(0),
 Nevec_acc(_Nu)
-{ assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
+{ GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
 ////////////////////////////////
 // Helpers
@@ -206,7 +206,7 @@ public:
 Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
 }
 }
-assert(normalize(w,if_print) != 0);
+GRID_ASSERT(normalize(w,if_print) != 0);
 }
 void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
 {
@@ -225,7 +225,7 @@ public:
 w[i] = w[i] - ip * evec[j];
 }}
 for(int i=0; i<_Nu; ++i)
-assert(normalize(w[i],if_print) !=0);
+GRID_ASSERT(normalize(w[i],if_print) !=0);
 }
@@ -244,7 +244,7 @@ public:
 const uint64_t sites = grid->lSites();
 int Nbatch = R/Nevec_acc;
-assert( R%Nevec_acc == 0 );
+GRID_ASSERT( R%Nevec_acc == 0 );
 //  Glog << "nBatch, Nevec_acc, R, Nu = "
 //       << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl;
@@ -302,7 +302,7 @@ public:
 }
 }
 for (int i=0; i<Nu; ++i) {
-assert(normalize(w[i],do_print)!=0);
+GRID_ASSERT(normalize(w[i],do_print)!=0);
 }
 Glog << "cuBLAS Zgemm done"<< std::endl;
@@ -374,8 +374,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
 {
 std::string fname = std::string(cname+"::calc_irbl()");
 GridBase *grid = evec[0].Grid();
-assert(grid == src[0].Grid());
-assert( Nu = src.size() );
+GRID_ASSERT(grid == src[0].Grid());
+GRID_ASSERT( Nu = src.size() );
 Glog << std::string(74,'*') << std::endl;
 Glog << fname + " starting iteration 0 /  "<< MaxIter<< std::endl;
@@ -396,7 +396,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
 }
 Glog << std::string(74,'*') << std::endl;
-assert(Nm == evec.size() && Nm == eval.size());
+GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
 std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
 std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -579,8 +579,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
 {
 std::string fname = std::string(cname+"::calc_rbl()");
 GridBase *grid = evec[0].Grid();
-assert(grid == src[0].Grid());
-assert( Nu = src.size() );
+GRID_ASSERT(grid == src[0].Grid());
+GRID_ASSERT( Nu = src.size() );
 int Np = (Nm-Nk);
 if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -607,7 +607,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
 }
 Glog << std::string(74,'*') << std::endl;
-assert(Nm == evec.size() && Nm == eval.size());
+GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
 std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
 std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -785,7 +785,7 @@ private:
 int Nu = w.size();
 int Nm = evec.size();
-assert( b < Nm/Nu );
+GRID_ASSERT( b < Nm/Nu );
 //  GridCartesian *grid = evec[0]._grid;
 // converts block index to full indicies for an interval [L,R)
@@ -796,7 +796,7 @@ private:
 Glog << "Using split grid"<< std::endl;
 //   LatticeGaugeField s_Umu(SGrid);
-assert((Nu%mrhs)==0);
+GRID_ASSERT((Nu%mrhs)==0);
 std::vector<Field>   in(mrhs,f_grid);
 Field s_in(sf_grid);
@@ -906,7 +906,7 @@ if(split_test){
 for (int u=0; u<Nu; ++u) {
 //  Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
-assert (!isnan(norm2(w[u])));
+GRID_ASSERT (!isnan(norm2(w[u])));
 for (int k=L+u; k<R; ++k) {
 Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
 }
@@ -929,8 +929,8 @@ if(split_test){
 Eigen::MatrixXcd & Qt, // Nm x Nm
 GridBase *grid)
 {
-assert( Nk%Nu == 0 && Nm%Nu == 0 );
-assert( Nk <= Nm );
+GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+GRID_ASSERT( Nk <= Nm );
 Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
 for ( int u=0; u<Nu; ++u ) {
@@ -970,8 +970,8 @@ if(split_test){
 GridBase *grid)
 {
 Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
-assert( Nk%Nu == 0 && Nm%Nu == 0 );
-assert( Nk <= Nm );
+GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+GRID_ASSERT( Nk <= Nm );
 Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
 for ( int u=0; u<Nu; ++u ) {
@@ -1119,7 +1119,7 @@ if (1){
 diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
 #endif
 } else {
-assert(0);
+GRID_ASSERT(0);
 }
 }
@@ -1131,8 +1131,8 @@ if (1){
 Eigen::MatrixXcd& M)
 {
 //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
-assert( Nk%Nu == 0 && Nm%Nu == 0 );
-assert( Nk <= Nm );
+GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+GRID_ASSERT( Nk <= Nm );
 M = Eigen::MatrixXcd::Zero(Nk,Nk);
 // rearrange
@@ -1159,8 +1159,8 @@ if (1){
 Eigen::MatrixXcd& M)
 {
 //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
-assert( Nk%Nu == 0 && Nm%Nu == 0 );
-assert( Nk <= Nm );
+GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+GRID_ASSERT( Nk <= Nm );
 // rearrange
 for ( int u=0; u<Nu; ++u ) {

View File

@@ -121,7 +121,7 @@ public:
 eresid(_eresid),  MaxIter(_MaxIter),
 diagonalisation(_diagonalisation),
 Nevec_acc(_Nu)
-{ assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
+{ GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
 ////////////////////////////////
 // Helpers
@@ -151,7 +151,7 @@ public:
 Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
 }
 }
-assert(normalize(w,if_print) != 0);
+GRID_ASSERT(normalize(w,if_print) != 0);
 }
 void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
 {
@@ -169,7 +169,7 @@ public:
 w[i] = w[i] - ip * evec[j];
 }}
 for(int i=0; i<_Nu; ++i)
-assert(normalize(w[i],if_print) !=0);
+GRID_ASSERT(normalize(w[i],if_print) !=0);
 }
 void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu)
@@ -205,8 +205,8 @@ public:
 {
 std::string fname = std::string(cname+"::calc_irbl()");
 GridBase *grid = evec[0].Grid();
-assert(grid == src[0].Grid());
-assert( Nu = src.size() );
+GRID_ASSERT(grid == src[0].Grid());
+GRID_ASSERT( Nu = src.size() );
 Glog << std::string(74,'*') << std::endl;
 Glog << fname + " starting iteration 0 /  "<< MaxIter<< std::endl;
@@ -227,7 +227,7 @@ public:
 }
 Glog << std::string(74,'*') << std::endl;
-assert(Nm == evec.size() && Nm == eval.size());
+GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
 std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
 std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -288,7 +288,7 @@ public:
 //----------------------------------------------------------------------
 if ( Nm>Nk ) {
-Glog <<" #Apply shifted QR transformations "<<std::endl;
+//  Glog <<" #Apply shifted QR transformations "<<std::endl;
 //int k2 = Nk+Nu;
 int k2 = Nk;
@@ -298,6 +298,7 @@ public:
 unpackHermitBlockTriDiagMatToEigen(lmd,lme,Nu,Nblock_m,Nm,Nm,BTDM);
 for(int ip=Nk; ip<Nm; ++ip){
+Glog << " ip "<<ip<<" / "<<Nm<<std::endl;
 shiftedQRDecompEigen(BTDM,Nu,Nm,eval2[ip],Q);
 }
@@ -412,8 +413,8 @@ public:
 {
 std::string fname = std::string(cname+"::calc_rbl()");
 GridBase *grid = evec[0].Grid();
-assert(grid == src[0].Grid());
-assert( Nu = src.size() );
+GRID_ASSERT(grid == src[0].Grid());
+GRID_ASSERT( Nu = src.size() );
 int Np = (Nm-Nk);
 if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -440,7 +441,7 @@ public:
 }
 Glog << std::string(74,'*') << std::endl;
-assert(Nm == evec.size() && Nm == eval.size());
+GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
 std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
 std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -467,10 +468,10 @@ public:
 // set initial vector
 for (int i=0; i<Nu; ++i) {
-//  Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
+Glog << "norm2(src[" << i << "])= "<< norm2(src[i]) << std::endl;
 evec[i] = src[i];
 orthogonalize(evec[i],evec,i);
-//  Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
+Glog << "norm2(evec[" << i << "])= "<< norm2(evec[i]) << std::endl;
 }
 //  exit(-43);
@@ -506,11 +507,11 @@ public:
 Qt = Eigen::MatrixXcd::Identity(Nr,Nr);
 diagonalize(eval2,lmd2,lme2,Nu,Nr,Nr,Qt,grid);
 _sort.push(eval2,Nr);
-//  Glog << "#Ritz value: "<< std::endl;
+Glog << "#Ritz value: "<< std::endl;
 for(int i=0; i<Nr; ++i){
-//  std::cout.precision(13);
-//  std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-//  std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+std::cout.precision(13);
+std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
 }
 // Convergence test
@@ -570,6 +571,7 @@ public:
 Glog << fname + " NOT converged ; Summary :\n";
 } else {
 Glog << fname + " CONVERGED ; Summary :\n";
+Nstop = Nconv_guess; // Just take them all
 // Sort convered eigenpairs.
 std::vector<Field>  Btmp(Nstop,grid); // waste of space replicating
@@ -620,7 +622,7 @@ private:
 int Nu = w.size();
 int Nm = evec.size();
-assert( b < Nm/Nu );
+GRID_ASSERT( b < Nm/Nu );
 // converts block index to full indicies for an interval [L,R)
 int L = Nu*b;
@@ -628,7 +630,7 @@ private:
 Real beta;
-assert((Nu%mrhs)==0);
+GRID_ASSERT((Nu%mrhs)==0);
 std::vector<Field>   in(mrhs,f_grid);
 std::vector<Field>   out(mrhs,f_grid);
@@ -709,7 +711,7 @@ private:
 for (int u=0; u<Nu; ++u) {
 //  Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
-assert (!isnan(norm2(w[u])));
+GRID_ASSERT (!isnan(norm2(w[u])));
 for (int k=L+u; k<R; ++k) {
 //  Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
 }
@@ -732,8 +734,8 @@ private:
 Eigen::MatrixXcd & Qt, // Nm x Nm
 GridBase *grid)
 {
-assert( Nk%Nu == 0 && Nm%Nu == 0 );
-assert( Nk <= Nm );
+GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+GRID_ASSERT( Nk <= Nm );
 Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
 for ( int u=0; u<Nu; ++u ) {
@@ -773,8 +775,8 @@ private:
 GridBase *grid)
 {
 Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
-assert( Nk%Nu == 0 && Nm%Nu == 0 );
-assert( Nk <= Nm );
+GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+GRID_ASSERT( Nk <= Nm );
 Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
 for ( int u=0; u<Nu; ++u ) {
@@ -922,7 +924,7 @@ if (1){
 diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
 #endif
 } else {
-assert(0);
+GRID_ASSERT(0);
 }
 }
@@ -934,8 +936,8 @@ if (1){
 Eigen::MatrixXcd& M)
 {
 //  Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
-assert( Nk%Nu == 0 && Nm%Nu == 0 );
-assert( Nk <= Nm );
+GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+GRID_ASSERT( Nk <= Nm );
 M = Eigen::MatrixXcd::Zero(Nk,Nk);
 // rearrange
@@ -951,7 +953,7 @@ if (1){
 M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
 }
 }
-//Glog << "unpackHermitBlockTriDiagMatToEigen() end" << endl;
+//  Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
 }
@@ -962,8 +964,8 @@ if (1){
 Eigen::MatrixXcd& M)
 {
 //  Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
-assert( Nk%Nu == 0 && Nm%Nu == 0 );
-assert( Nk <= Nm );
+GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
+GRID_ASSERT( Nk <= Nm );
 // rearrange
 for ( int u=0; u<Nu; ++u ) {
@@ -977,7 +979,7 @@ if (1){
 lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
 }
 }
-//Glog << "packHermitBlockTriDiagMatfromEigen() end" << endl;
+//  Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
 }
@@ -1002,6 +1004,7 @@ if (1){
 // lower triangular part used to represent series
 // of Q sequence.
+//  Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
 // equivalent operation of Qprod *= Q
 //M = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1022,6 +1025,7 @@ if (1){
 Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
+//  Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
 for (int i=0; i<Nm; ++i) {
 for (int j=0; j<Nm-(Nu+1); ++j) {
 for (int k=0; k<Nu+1+j; ++k) {
@@ -1029,6 +1033,7 @@ if (1){
 }
 }
 }
+//  Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
 for (int i=0; i<Nm; ++i) {
 for (int j=Nm-(Nu+1); j<Nm; ++j) {
 for (int k=0; k<Nm; ++k) {
@@ -1036,6 +1041,7 @@ if (1){
 }
 }
 }
+//  Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
 //static int ntimes = 2;
 //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@@ -1061,11 +1067,13 @@ if (1){
 Mtmp(j,i) = conj(Mtmp(i,j));
 }
 }
+//  Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
 for (int i=0; i<Nm; ++i) {
 Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
 }
+//  Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
 M = Mtmp;
 //M = Q.adjoint()*(M*Q);
@@ -1077,7 +1085,7 @@ if (1){
 //  }
 //}
-//Glog << "shiftedQRDecompEigen() end" << endl;
+//  Glog << "shiftedQRDecompEigen() end" <<std::endl;
 }
 void exampleQRDecompEigen(void)
View File

@@ -211,7 +211,7 @@ until convergence
 void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
 {
 GridBase *grid = src.Grid();
-assert(grid == evec[0].Grid());
+GRID_ASSERT(grid == evec[0].Grid());
 //  GridLogIRL.TimingMode(1);
 std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -231,7 +231,7 @@ until convergence
 }
 std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
-assert(Nm <= evec.size() && Nm <= eval.size());
+GRID_ASSERT(Nm <= evec.size() && Nm <= eval.size());
 // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
 RealD evalMaxApprox = 0.0;
@@ -245,9 +245,10 @@ until convergence
 _HermOp(src_n,tmp);
 //  std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
 //  std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
-RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+//  RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
 RealD vden = norm2(src_n);
-RealD na = vnum/vden;
+RealD na = std::sqrt(vnum/vden);
 if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 i=_MAX_ITER_IRL_MEVAPP_;
 evalMaxApprox = na;
@@ -255,6 +256,7 @@ until convergence
 src_n = tmp;
 }
 }
+std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
 std::vector<RealD> lme(Nm);
 std::vector<RealD> lme2(Nm);
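The change in this hunk swaps the Rayleigh quotient <x,Ax>/<x,x> for sqrt(<Ax,Ax>/<x,x>) = ||Ax||/||x|| inside the power iteration used to estimate the spectral radius; both converge to the largest |eigenvalue| of a Hermitian operator, but the norm-based form is manifestly non-negative and insensitive to sign conventions. A standalone sketch of the same estimate (Eigen, illustrative matrix and tolerances, not Grid code):

```cpp
// Standalone power-iteration sketch: estimate the largest |eigenvalue| of a
// symmetric A via na = ||A x|| / ||x||, as in the updated evalMaxApprox loop.
#include <Eigen/Dense>
#include <iostream>
#include <cmath>

int main(void){
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(6,6);
  Eigen::MatrixXd A = 0.5*(B + B.transpose());   // symmetric test operator
  Eigen::VectorXd x = Eigen::VectorXd::Random(6);
  double na = 0.0, prev = 0.0;
  for (int i = 0; i < 100; i++) {
    Eigen::VectorXd tmp = A * x;
    na = std::sqrt(tmp.squaredNorm() / x.squaredNorm()); // ||Ax||/||x||
    if (std::fabs(prev/na - 1.0) < 1.0e-4) break;        // converged estimate
    prev = na;
    x = tmp / tmp.norm();                                // renormalise iterate
  }
  Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> es(A);
  std::cout << "estimate " << na << "  exact max|lambda| "
            << es.eigenvalues().cwiseAbs().maxCoeff() << std::endl;
  return 0;
}
```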
@@ -335,7 +337,7 @@ until convergence
 }
 std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
-assert(k2<Nm);      assert(k2<Nm);      assert(k1>0);
+GRID_ASSERT(k2<Nm);      GRID_ASSERT(k2<Nm);      GRID_ASSERT(k1>0);
 basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
 std::cout<<GridLogIRL <<"basisRotated  by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
@@ -461,7 +463,7 @@ until convergence
 {
 std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
 const RealD tiny = 1.0e-20;
-assert( k< Nm );
+GRID_ASSERT( k< Nm );
 GridStopWatch gsw_op,gsw_o;
@@ -595,7 +597,7 @@ until convergence
 } else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
 diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
 } else {
-assert(0);
+GRID_ASSERT(0);
 }
 }
@@ -685,7 +687,7 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
 }
 }
 #else
-assert(0);
+GRID_ASSERT(0);
 #endif
 }

View File

@@ -80,7 +80,7 @@ public:
 ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
 _Linop(linop), subspace(_subspace)
 {
-assert(subspace.size() >0);
+GRID_ASSERT(subspace.size() >0);
 };
 void operator()(const CoarseField& in, CoarseField& out) {
@@ -346,12 +346,12 @@ public:
 void testFine(RealD resid)
 {
-assert(evals_fine.size() == nbasis);
-assert(subspace.size() == nbasis);
+GRID_ASSERT(evals_fine.size() == nbasis);
+GRID_ASSERT(subspace.size() == nbasis);
 PlainHermOp<FineField>    Op(_FineOp);
 ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
 for(int k=0;k<nbasis;k++){
-assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
+GRID_ASSERT(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
 }
 }
@@ -359,8 +359,8 @@ public:
 //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
 void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
 {
-assert(evals_fine.size() == nbasis);
-assert(subspace.size() == nbasis);
+GRID_ASSERT(evals_fine.size() == nbasis);
+GRID_ASSERT(subspace.size() == nbasis);
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
 //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -380,7 +380,7 @@ public:
 void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
 RealD MaxIt, RealD betastp, int MinRes)
 {
-assert(nbasis<=Nm);
+GRID_ASSERT(nbasis<=Nm);
 Chebyshev<FineField>      Cheby(cheby_parms);
 FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
 PlainHermOp<FineField>    Op(_FineOp);
@@ -400,8 +400,8 @@ public:
 IRL.calc(evals_fine,subspace,src,Nconv,false);
 // Shrink down to number saved
-assert(Nstop>=nbasis);
-assert(Nconv>=nbasis);
+GRID_ASSERT(Nstop>=nbasis);
+GRID_ASSERT(Nconv>=nbasis);
 evals_fine.resize(nbasis);
 subspace.resize(nbasis,_FineGrid);
 }
@@ -433,7 +433,7 @@ public:
 ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
 int Nconv=0;
 IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
-assert(Nconv>=Nstop);
+GRID_ASSERT(Nconv>=Nstop);
 evals_coarse.resize(Nstop);
 evec_coarse.resize (Nstop,_CoarseGrid);
 for (int i=0;i<Nstop;i++){


@@ -35,7 +35,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
 public:
 using OperatorFunction<Field>::operator();
-bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
+bool ErrorOnNoConverge; // throw an GRID_ASSERT when the MR fails to converge.
 // Defaults true.
 RealD Tolerance;
 Integer MaxIterations;
@@ -59,7 +59,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
 // Initial residual computation & set up
 RealD guess = norm2(psi);
-assert(std::isnan(guess) == 0);
+GRID_ASSERT(std::isnan(guess) == 0);
 RealD ssq = norm2(src);
 RealD rsq = Tolerance * Tolerance * ssq;
@@ -136,7 +136,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
 std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
 if (ErrorOnNoConverge)
-assert(true_residual / Tolerance < 10000.0);
+GRID_ASSERT(true_residual / Tolerance < 10000.0);
 IterationsToComplete = k;
@@ -148,7 +148,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
 << std::endl;
 if (ErrorOnNoConverge)
-assert(0);
+GRID_ASSERT(0);
 IterationsToComplete = k;
 }


@@ -37,7 +37,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
 using OperatorFunction<FieldD>::operator();
-bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
+bool ErrorOnNoConverge; // Throw an GRID_ASSERT when MPFGMRES fails to converge,
 // defaults to true
 RealD Tolerance;
@@ -91,7 +91,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
 conformable(psi, src);
 RealD guess = norm2(psi);
-assert(std::isnan(guess) == 0);
+GRID_ASSERT(std::isnan(guess) == 0);
 RealD cp;
 RealD ssq = norm2(src);
@@ -150,7 +150,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
 std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
 if (ErrorOnNoConverge)
-assert(0);
+GRID_ASSERT(0);
 }
 RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
@@ -197,7 +197,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
 }
 }
-assert(0); // Never reached
+GRID_ASSERT(0); // Never reached
 return cp;
 }


@@ -60,6 +60,32 @@ public:
 }
 };
+template<class Field> class NormalResidual : public LinearFunction<Field>{
+private:
+SparseMatrixBase<Field> & _Matrix;
+OperatorFunction<Field> & _HermitianSolver;
+LinearFunction<Field> & _Guess;
+public:
+/////////////////////////////////////////////////////
+// Wrap the usual normal equations trick
+/////////////////////////////////////////////////////
+NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
+LinearFunction<Field> &Guess)
+: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
+void operator() (const Field &in, Field &out){
+Field res(in.Grid());
+Field tmp(in.Grid());
+MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
+_Guess(in,res);
+_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
+_Matrix.Mdag(res,out); // out = Mdag res
+}
+};
 template<class Field> class HPDSolver : public LinearFunction<Field> {
 private:
 LinearOperatorBase<Field> & _Matrix;
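The NormalResidual class added above wraps the normal-equations trick: to solve M x = b for a non-Hermitian M, it solves the Hermitian system M Mdag y = b with the supplied solver and sets x = Mdag y. A usage sketch under assumed wiring (Matrix, src and sol are placeholders for an existing SparseMatrixBase implementation and fields; ZeroGuesser and ConjugateGradient are the stock Grid components):

    ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
    ZeroGuesser<LatticeFermion> Guess;
    NormalResidual<LatticeFermion> Solver(Matrix, CG, Guess);
    Solver(src, sol);   // CG solves M Mdag y = src, then sol = Mdag y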


@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
 RealD evalMaxApprox = 0.0;
 auto src_n = src;
 auto tmp = src;
-const int _MAX_ITER_EST_ = 100;
+const int _MAX_ITER_EST_ = 200;
 for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -30,18 +30,17 @@ template<class Field> class PowerMethod
 RealD vden = norm2(src_n);
 RealD na = vnum/vden;
-std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
-if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
-evalMaxApprox = na;
-std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
-return evalMaxApprox;
-}
+std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
+// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
+// evalMaxApprox = na;
+// return evalMaxApprox;
+// }
 evalMaxApprox = na;
 src_n = tmp;
 }
-assert(0);
-return 0;
+std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
+return evalMaxApprox;
 }
 };
 }
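After this change the estimator no longer exits early on a relative-change test; it always runs the full 200 iterations and returns the final Rayleigh quotient, lambda = <x, A x> / <x, x>, which converges to the largest eigenvalue as x aligns with the dominant eigenvector. A standalone sketch of the same iteration, assuming Grid-style norm2/innerProduct free functions and a callable Hermitian operator (illustrative, not the class above):

    // Plain power iteration with Rayleigh-quotient readout.
    template<class Field, class Op>
    RealD power_estimate(Op &HermOp, Field x, int iters = 200) {
      RealD lambda = 0.0;
      Field Ax = x;
      for (int i = 0; i < iters; i++) {
        HermOp(x, Ax);                                  // Ax = A x
        lambda = real(innerProduct(x, Ax)) / norm2(x);  // Rayleigh quotient
        x = Ax * (1.0 / sqrt(norm2(Ax)));               // renormalise for next step
      }
      return lambda;
    }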


@@ -0,0 +1,76 @@
+#pragma once
+namespace Grid {
+class Band
+{
+RealD lo, hi;
+public:
+Band(RealD _lo,RealD _hi)
+{
+lo=_lo;
+hi=_hi;
+}
+RealD operator() (RealD x){
+if ( x>lo && x<hi ){
+return 1.0;
+} else {
+return 0.0;
+}
+}
+};
+class PowerSpectrum
+{
+public:
+template<typename T> static RealD normalise(T& v)
+{
+RealD nn = norm2(v);
+nn = sqrt(nn);
+v = v * (1.0/nn);
+return nn;
+}
+std::vector<RealD> ranges;
+std::vector<int> order;
+PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
+template<class Field>
+RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
+{
+GridBase *grid = src.Grid();
+int N=ranges.size();
+RealD hi = ranges[N-1];
+RealD lo_band = 0.0;
+RealD hi_band;
+RealD nn=norm2(src);
+RealD ss=0.0;
+Field tmp = src;
+for(int b=0;b<N;b++){
+hi_band = ranges[b];
+Band Notch(lo_band,hi_band);
+Chebyshev<Field> polynomial;
+polynomial.Init(0.0,hi,order[b],Notch);
+polynomial.JacksonSmooth();
+polynomial(HermOp,src,tmp) ;
+RealD p=norm2(tmp);
+ss=ss+p;
+std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
+lo_band=hi_band;
+}
+std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
+std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
+return 0;
+};
+};
+}
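PowerSpectrum reports how the source's norm is distributed over eigenvalue bands of HermOp: each bin [lo,hi] gets a Jackson-smoothed Chebyshev approximation to its indicator function, and the squared norm of the filtered vector, relative to the source norm, is printed per band. A call sketch with illustrative bin edges and polynomial orders (narrow low-lying bands need higher order to resolve):

    // Hypothetical band edges and orders; tune to the spectrum of HermOp.
    std::vector<RealD> bins  = {0.01, 0.1, 1.0, 10.0, 64.0}; // upper edges; last = spectral bound
    std::vector<int>   order = {2000,  500, 100,   50,   50};
    PowerSpectrum GetSpectrum(bins, order);
    GetSpectrum(HermOp, src);  // prints per-band and total power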


@@ -112,7 +112,7 @@ public:
 }
 std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
-assert(0);
+GRID_ASSERT(0);
 }
 };
 NAMESPACE_END(Grid);


@@ -118,7 +118,7 @@ public:
 }
 GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
-// assert(0);
+// GRID_ASSERT(0);
 }
 RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -221,7 +221,7 @@ public:
 int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
 for(int back=0;back<northog;back++){
-int peri_back=(k-back)%mmax; assert((k-back)>=0);
+int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0);
 b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
 p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -231,7 +231,7 @@ public:
 qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
 LinalgTimer.Stop();
 }
-assert(0); // never reached
+GRID_ASSERT(0); // never reached
 return cp;
 }
 };


@@ -74,7 +74,7 @@ public:
 void operator() (const Field &src, Field &psi){
-psi=Zero();
+// psi=Zero();
 RealD cp, ssq,rsq;
 ssq=norm2(src);
 rsq=Tolerance*Tolerance*ssq;
@@ -113,7 +113,7 @@ public:
 }
 GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
-// assert(0);
+// GRID_ASSERT(0);
 }
 RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -224,7 +224,7 @@ public:
 int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
 for(int back=0;back<northog;back++){
-int peri_back=(k-back)%mmax; assert((k-back)>=0);
+int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0);
 b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
 p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -234,7 +234,7 @@ public:
 qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
 LinalgTimer.Stop();
 }
-assert(0); // never reached
+GRID_ASSERT(0); // never reached
 return cp;
 }
 };


@@ -79,7 +79,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
 LinOp.Op(x,r); r = b - r;
-assert(normb> 0.0);
+GRID_ASSERT(normb> 0.0);
 resid = norm2(r)/normb;
 if (resid <= Tolerance) {
@@ -105,8 +105,8 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
 for (int i = 1; i <= MaxIterations; i++) {
 // Breakdown tests
-assert( rho != 0.0);
-assert( xi != 0.0);
+GRID_ASSERT( rho != 0.0);
+GRID_ASSERT( xi != 0.0);
 v = (1. / rho) * v_tld;
 y = (1. / rho) * y;
@@ -134,10 +134,10 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
 ep=Zep.real();
 std::cout << "Zep "<<Zep <<std::endl;
 // Complex Audit
-assert(abs(ep)>0);
+GRID_ASSERT(abs(ep)>0);
 beta = ep / delta;
-assert(abs(beta)>0);
+GRID_ASSERT(abs(beta)>0);
 v_tld = p_tld - beta * v;
 y = v_tld;
@@ -158,7 +158,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
 std::cout << "theta "<<theta<<std::endl;
 std::cout << "gamma "<<gamma<<std::endl;
-assert(abs(gamma)> 0.0);
+GRID_ASSERT(abs(gamma)> 0.0);
 eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
@@ -178,7 +178,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
 }
 std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
 }
-assert(0);
+GRID_ASSERT(0);
 return; // no convergence
 }
 #else


@@ -327,9 +327,9 @@ namespace Grid {
 /////////////////////////////////////////////////////
 // src_o = (source_o - Moe MeeInv source_e)
 /////////////////////////////////////////////////////
-_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
-_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
-tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
+_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
+_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
+tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
 _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
 }
@@ -347,17 +347,17 @@ namespace Grid {
 ///////////////////////////////////////////////////
 // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 ///////////////////////////////////////////////////
-_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
-src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
-_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
-setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
-setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
+_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
+src_e = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
+_Matrix.MooeeInv(src_e,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
+setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
+setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
 }
 virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
 {
 SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
-this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
+this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd);
 };
 virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
 {
@@ -396,13 +396,13 @@ namespace Grid {
 /////////////////////////////////////////////////////
 // src_o = Mdag * (source_o - Moe MeeInv source_e)
 /////////////////////////////////////////////////////
-_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
-_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
-tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
+_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
+_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
+tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
 // get the right MpcDag
 SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
-_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
+_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
 }
 virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -416,17 +416,17 @@ namespace Grid {
 ///////////////////////////////////////////////////
 // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 ///////////////////////////////////////////////////
-_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
-src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
-_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
-setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
-setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
+_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
+src_e_i = src_e-tmp; GRID_ASSERT( src_e_i.Checkerboard() ==Even);
+_Matrix.MooeeInv(src_e_i,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
+setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
+setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
 }
 virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
 {
 SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
-this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
+this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd);
 };
 virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
 {
@@ -461,9 +461,9 @@ namespace Grid {
 /////////////////////////////////////////////////////
 // src_o = Mdag * (source_o - Moe MeeInv source_e)
 /////////////////////////////////////////////////////
-_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
-_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
-src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
+_Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
+_Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd );
+src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd );
 }
 virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -478,18 +478,18 @@ namespace Grid {
 ///////////////////////////////////////////////////
 // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 ///////////////////////////////////////////////////
-_Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even );
-src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even );
-_Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even );
-setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
-setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd );
+_Matrix.Meooe(sol_o, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
+src_e_i = src_e - tmp; GRID_ASSERT( src_e_i.Checkerboard() == Even );
+_Matrix.MooeeInv(src_e_i, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
+setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
+setCheckerboard(sol, sol_o); GRID_ASSERT( sol_o.Checkerboard() == Odd );
 }
 virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
 {
 NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
-this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd);
+this->_HermitianRBSolver(_OpEO, src_o, sol_o); GRID_ASSERT(sol_o.Checkerboard() == Odd);
 }
 virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
@@ -499,6 +499,87 @@ namespace Grid {
 }
 };
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// Site diagonal is identity, left preconditioned by Mee^inv
+// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
+//
+// Solve:
+// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
+//
+// Old notation e<->o
+//
+// Left precon by Moo^-1
+// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
+// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
+public:
+typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
+/////////////////////////////////////////////////////
+// Wrap the usual normal equations Schur trick
+/////////////////////////////////////////////////////
+SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
+const bool _solnAsInitGuess = false)
+: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
+virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
+{
+GridBase *grid = _Matrix.RedBlackGrid();
+GridBase *fgrid= _Matrix.Grid();
+SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+Field tmp(grid);
+Field Mtmp(grid);
+pickCheckerboard(Even,src_e,src);
+pickCheckerboard(Odd ,src_o,src);
+/////////////////////////////////////////////////////
+// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
+/////////////////////////////////////////////////////
+_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
+_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
+Mtmp=src_o-Mtmp;
+_Matrix.MooeeInv(Mtmp,tmp); GRID_ASSERT( tmp.Checkerboard() ==Odd);
+// get the right MpcDag
+_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
+}
+virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
+{
+GridBase *grid = _Matrix.RedBlackGrid();
+GridBase *fgrid= _Matrix.Grid();
+Field tmp(grid);
+Field sol_e(grid);
+///////////////////////////////////////////////////
+// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
+///////////////////////////////////////////////////
+_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
+tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
+_Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
+setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
+setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
+};
+virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
+{
+SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+};
+virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
+{
+SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
+this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
+}
+};
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Site diagonal is identity, right preconditioned by Mee^inv
 // ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = eta
@@ -531,12 +612,12 @@ namespace Grid {
 /////////////////////////////////////////////////////
 // src_o = Mdag * (source_o - Moe MeeInv source_e)
 /////////////////////////////////////////////////////
-_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
-_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
-tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
+_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
+_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
+tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
 // get the right MpcDag
-_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
+_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
 }
 virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -557,12 +638,12 @@ namespace Grid {
 ///////////////////////////////////////////////////
 // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 ///////////////////////////////////////////////////
-_Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
-tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
-_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
-setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
-setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
+_Matrix.Meooe(sol_o_i,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
+tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
+_Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
+setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
+setCheckerboard(sol,sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() ==Odd );
 };
 virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -603,9 +684,9 @@ namespace Grid {
 /////////////////////////////////////////////////////
 // src_o = Mdag * (source_o - Moe MeeInv source_e)
 /////////////////////////////////////////////////////
-_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
-_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
-src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
+_Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
+_Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd );
+src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd );
 }
 virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -626,12 +707,12 @@ namespace Grid {
 ///////////////////////////////////////////////////
 // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 ///////////////////////////////////////////////////
-_Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even );
-tmp = src_e - tmp; assert( src_e.Checkerboard() == Even );
-_Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even );
-setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
-setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd );
+_Matrix.Meooe(sol_o_i, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
+tmp = src_e - tmp; GRID_ASSERT( src_e.Checkerboard() == Even );
+_Matrix.MooeeInv(tmp, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
+setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
+setCheckerboard(sol, sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() == Odd );
 };
 virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
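The new SchurRedBlackDiagOneSolve plugs into the same calling convention as the other red-black wrappers: hand it a Hermitian solver, then apply it to a checkerboarded operator with a full-grid source and solution. A usage sketch (Ddwf stands for any CheckerBoardedSparseMatrixBase implementation, e.g. a domain-wall Dirac operator; names are illustrative):

    ConjugateGradient<LatticeFermion> CG(1.0e-10, 30000);
    SchurRedBlackDiagOneSolve<LatticeFermion> SchurSolver(CG);
    SchurSolver(Ddwf, src, sol);  // forms the odd-odd system, solves, reconstructs even sites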


@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #pragma once
+#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 NAMESPACE_BEGIN(Grid);
 inline RealD AggregatePowerLaw(RealD x)
@@ -95,7 +97,7 @@ public:
 RealD scale;
-ConjugateGradient<FineField> CG(1.0e-2,100,false);
+ConjugateGradient<FineField> CG(1.0e-3,400,false);
 FineField noise(FineGrid);
 FineField Mn(FineGrid);
@@ -108,7 +110,7 @@ public:
 hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-for(int i=0;i<1;i++){
+for(int i=0;i<4;i++){
 CG(hermop,noise,subspace[b]);
@@ -124,6 +126,53 @@ public:
 }
 }
+virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
+{
+RealD scale;
+TrivialPrecon<FineField> simple_fine;
+PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
+FineField noise(FineGrid);
+FineField src(FineGrid);
+FineField guess(FineGrid);
+FineField Mn(FineGrid);
+for(int b=0;b<nn;b++){
+subspace[b] = Zero();
+gaussian(RNG,noise);
+scale = std::pow(norm2(noise),-0.5);
+noise=noise*scale;
+DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
+for(int i=0;i<2;i++){
+// void operator() (const Field &src, Field &psi){
+#if 1
+std::cout << GridLogMessage << " inverting on noise "<<std::endl;
+src = noise;
+guess=Zero();
+GCR(src,guess);
+subspace[b] = guess;
+#else
+std::cout << GridLogMessage << " inverting on zero "<<std::endl;
+src=Zero();
+guess = noise;
+GCR(src,guess);
+subspace[b] = guess;
+#endif
+noise = subspace[b];
+scale = std::pow(norm2(noise),-0.5);
+noise=noise*scale;
+}
+DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
+subspace[b] = noise;
+}
+}
 ////////////////////////////////////////////////////////////////////////////////////////////////
 // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
 // and this is the best I found
@@ -160,14 +209,21 @@ public:
 int b =0;
 {
+ComplexD ip;
 // Filter
 Chebyshev<FineField> Cheb(lo,hi,orderfilter);
 Cheb(hermop,noise,Mn);
 // normalise
 scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
 subspace[b] = Mn;
 hermop.Op(Mn,tmp);
-std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+ip= innerProduct(Mn,tmp);
+std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+hermop.AdjOp(Mn,tmp);
+ip = innerProduct(Mn,tmp);
+std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
 b++;
 }
@@ -213,8 +269,18 @@ public:
 Mn=*Tnp;
 scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
 subspace[b] = Mn;
+ComplexD ip;
 hermop.Op(Mn,tmp);
-std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+ip= innerProduct(Mn,tmp);
+std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+hermop.AdjOp(Mn,tmp);
+ip = innerProduct(Mn,tmp);
+std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
 b++;
 }
@@ -226,8 +292,72 @@ public:
 }
 }
-assert(b==nn);
+GRID_ASSERT(b==nn);
 }
+virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
+int nn,
+double hi,
+double lo1,
+int orderfilter,
+double lo2,
+int orderstep)
+{
+RealD scale;
+FineField noise(FineGrid);
+FineField Mn(FineGrid);
+FineField tmp(FineGrid);
+// New normalised noise
+gaussian(RNG,noise);
+scale = std::pow(norm2(noise),-0.5);
+noise=noise*scale;
+std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
+// Initial matrix element
+hermop.Op(noise,Mn);
+std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+int b =0;
+{
+// Filter
+std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
+Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
+Cheb(hermop,noise,Mn);
+// normalise
+scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
+subspace[b] = Mn;
+hermop.Op(Mn,tmp);
+std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
+}
+// Generate a full sequence of Chebyshevs
+for(int n=1;n<nn;n++){
+std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
+Chebyshev<FineField> Cheb(lo2,hi,orderstep);
+Cheb(hermop,subspace[n-1],Mn);
+for(int m=0;m<n;m++){
+ComplexD c = innerProduct(subspace[m],Mn);
+Mn = Mn - c*subspace[m];
+}
+// normalise
+scale = std::pow(norm2(Mn),-0.5);
+Mn=Mn*scale;
+subspace[n]=Mn;
+hermop.Op(Mn,tmp);
+std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
+}
+}
 virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
 int nn,
 double hi,
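All of these subspace builders share one recipe: start from normalised Gaussian noise, apply a smoothing or approximate-inverse operator (CG inverse, GCR inverse, or a Chebyshev filter), renormalise, and keep the result as a near-null basis vector. A sketch of how the new GCR variant might be driven, assuming grids, an RNG and a non-Hermitian Dirac LinearOperator already exist (names are illustrative; nbasis is the Aggregation template parameter):

    typedef Aggregation<vSpinColourVector, vTComplex, nbasis> Subspace;
    Subspace Aggregates(CoarseGrid, FineGrid, 0);              // checkerboard 0
    Aggregates.CreateSubspaceGCR(RNG, NonHermDiracOp, nbasis); // fill all nbasis vectors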


@@ -99,7 +99,7 @@ public:
 CoarseMatrix AselfInvEven;
 CoarseMatrix AselfInvOdd;
-Vector<RealD> dag_factor;
+deviceVector<RealD> dag_factor;
 ///////////////////////
 // Interface
@@ -124,9 +124,13 @@ public:
 int npoint = geom.npoint;
 typedef LatticeView<Cobj> Aview;
-Vector<Aview> AcceleratorViewContainer;
-for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
+for(int p=0;p<geom.npoint;p++) {
+hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+}
 Aview *Aview_p = & AcceleratorViewContainer[0];
 const int Nsimd = CComplex::Nsimd();
@@ -161,7 +165,7 @@ public:
 coalescedWrite(out_v[ss](b),res);
 });
-for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
 };
 void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -190,9 +194,14 @@ public:
 int npoint = geom.npoint;
 typedef LatticeView<Cobj> Aview;
-Vector<Aview> AcceleratorViewContainer;
-for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
+for(int p=0;p<geom.npoint;p++) {
+hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+}
 Aview *Aview_p = & AcceleratorViewContainer[0];
 const int Nsimd = CComplex::Nsimd();
@@ -201,10 +210,10 @@ public:
 int osites=Grid()->oSites();
-Vector<int> points(geom.npoint, 0);
-for(int p=0; p<geom.npoint; p++)
-points[p] = geom.points_dagger[p];
+deviceVector<int> points(geom.npoint);
+for(int p=0; p<geom.npoint; p++) {
+acceleratorPut(points[p],geom.points_dagger[p]);
+}
 auto points_p = &points[0];
 RealD* dag_factor_p = &dag_factor[0];
@@ -236,7 +245,7 @@ public:
 coalescedWrite(out_v[ss](b),res);
 });
-for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
 }
 void MdirComms(const CoarseVector &in)
@@ -251,8 +260,14 @@ public:
 out.Checkerboard() = in.Checkerboard();
 typedef LatticeView<Cobj> Aview;
-Vector<Aview> AcceleratorViewContainer;
-for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
+deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
+for(int p=0;p<geom.npoint;p++) {
+hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
+acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+}
 Aview *Aview_p = & AcceleratorViewContainer[0];
 autoView( out_v , out, AcceleratorWrite);
@@ -285,7 +300,7 @@ public:
 }
 coalescedWrite(out_v[ss](b),res);
 });
-for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
+for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
 }
 void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
 {
@@ -294,7 +309,7 @@ public:
 if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
 std::cout <<"MdirAll out size "<< out.size()<<std::endl;
 std::cout <<"MdirAll ndir "<< ndir<<std::endl;
-assert(0);
+GRID_ASSERT(0);
 }
 for(int p=0;p<ndir;p++){
 MdirCalc(in,out[p],p);
@@ -358,7 +373,7 @@ public:
 conformable(in.Grid(), _cbgrid); // verifies half grid
 conformable(in.Grid(), out.Grid()); // drops the cb check
-assert(in.Checkerboard() == Even);
+GRID_ASSERT(in.Checkerboard() == Even);
 out.Checkerboard() = Odd;
 DhopInternal(StencilEven, Aodd, in, out, dag);
@@ -368,7 +383,7 @@ public:
 conformable(in.Grid(), _cbgrid); // verifies half grid
 conformable(in.Grid(), out.Grid()); // drops the cb check
-assert(in.Checkerboard() == Odd);
+GRID_ASSERT(in.Checkerboard() == Odd);
 out.Checkerboard() = Even;
 DhopInternal(StencilOdd, Aeven, in, out, dag);
@@ -376,7 +391,7 @@ public:
 void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
 out.Checkerboard() = in.Checkerboard();
-assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even);
 CoarseMatrix *Aself = nullptr;
 if(in.Grid()->_isCheckerBoarded) {
@@ -391,7 +406,7 @@ public:
 Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
 DselfInternal(Stencil, *Aself, in, out, dag);
 }
-assert(Aself != nullptr);
+GRID_ASSERT(Aself != nullptr);
 }
 void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
@@ -469,14 +484,20 @@ public:
 // determine in what order we need the points
 int npoint = geom.npoint-1;
-Vector<int> points(npoint, 0);
-for(int p=0; p<npoint; p++)
-points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
+deviceVector<int> points(npoint);
+for(int p=0; p<npoint; p++) {
+int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
+acceleratorPut(points[p], val);
+}
 auto points_p = &points[0];
-Vector<Aview> AcceleratorViewContainer;
-for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
+deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
+hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
+for(int p=0;p<geom.npoint;p++) {
+hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
+acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
+}
 Aview *Aview_p = & AcceleratorViewContainer[0];
 const int Nsimd = CComplex::Nsimd();
@@ -539,7 +560,7 @@ public:
 });
 }
-for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
+for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
 }
 CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@@ -590,11 +611,13 @@ public:
 }
 // GPU readable prefactor
+std::vector<RealD> h_dag_factor(nbasis*nbasis);
 thread_for(i, nbasis*nbasis, {
 int j = i/nbasis;
 int k = i%nbasis;
-dag_factor[i] = dag_factor_eigen(j, k);
+h_dag_factor[i] = dag_factor_eigen(j, k);
 });
+acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
 }
 void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
@@ -674,7 +697,7 @@ public:
 evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
 oddmask = one-evenmask;
-assert(self_stencil!=-1);
+GRID_ASSERT(self_stencil!=-1);
 for(int i=0;i<nbasis;i++){
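The recurring edit in this file is one pattern: the kernel-visible array of lattice views moves from unified memory (Vector<Aview>) to an explicit host/device pair. Views are opened on the host, the view objects are copied one by one into device storage with acceleratorPut, the kernel consumes the raw device pointer, and the close happens through the host copies. Schematically, using exactly the calls from the hunks above:

    deviceVector<Aview> dViews(npoint);  // device-resident; kernels index this
    hostVector<Aview>   hViews(npoint);  // host-resident; owns open/close
    for (int p = 0; p < npoint; p++) {
      hViews[p] = A[p].View(AcceleratorRead);   // open on host
      acceleratorPut(dViews[p], hViews[p]);     // copy the view object to device
    }
    Aview *Aview_p = &dViews[0];                // raw device pointer for kernels
    // ... accelerator_for kernels dereference Aview_p ...
    for (int p = 0; p < npoint; p++) hViews[p].ViewClose();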


@@ -99,7 +99,7 @@ public:
 }
 }
 }
-assert(nfound==geom.npoint);
+GRID_ASSERT(nfound==geom.npoint);
 ExchangeCoarseLinks();
 }
 */
@@ -124,7 +124,7 @@ public:
 }
 void Mdag (const CoarseVector &in, CoarseVector &out)
 {
-assert(hermitian);
+GRID_ASSERT(hermitian);
 Mult(_A,in,out);
 // if ( hermitian ) M(in,out);
 // else Mult(_Adag,in,out);
@@ -441,8 +441,20 @@ public:
 std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
 }
 #else
+//////////////////////////////////////////////////////////////////////
+// Galerkin projection of matrix
+//////////////////////////////////////////////////////////////////////
 void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
 Aggregation<Fobj,CComplex,nbasis> & Subspace)
+{
+CoarsenOperator(linop,Subspace,Subspace);
+}
+//////////////////////////////////////////////////////////////////////
+// Petrov - Galerkin projection of matrix
+//////////////////////////////////////////////////////////////////////
+void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+Aggregation<Fobj,CComplex,nbasis> & U,
+Aggregation<Fobj,CComplex,nbasis> & V)
 {
 std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
 GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
 // Orthogonalise the subblocks over the basis
 /////////////////////////////////////////////////////////////
 CoarseScalar InnerProd(CoarseGrid());
-blockOrthogonalise(InnerProd,Subspace.subspace);
-// for(int s=0;s<Subspace.subspace.size();s++){
-// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
-// }
+blockOrthogonalise(InnerProd,V.subspace);
+blockOrthogonalise(InnerProd,U.subspace);
 const int npoint = geom.npoint;
 Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
 std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
 for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 tphaseBZ-=usecond();
-phaV = phaF[p]*Subspace.subspace[i];
+phaV = phaF[p]*V.subspace[i];
 tphaseBZ+=usecond();
 /////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
 // std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
 tproj-=usecond();
-blockProject(coarseInner,MphaV,Subspace.subspace);
+blockProject(coarseInner,MphaV,U.subspace);
 coarseInner = conjugate(pha[p]) * coarseInner;
 ComputeProj[p] = coarseInner;
@@ -609,7 +619,7 @@ public:
 // _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
 }
 }
-virtual void Mdiag (const Field &in, Field &out){ assert(0);};
+virtual void Mdiag (const Field &in, Field &out){ GRID_ASSERT(0);};
 virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
 virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
 };
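The distinction the new comments draw, in formula form: the Galerkin coarsening restricts and prolongates with a single basis,

    A_c = U^dag A U

while the Petrov-Galerkin form allows different left and right bases,

    A_c = U^dag A V

so the one-argument CoarsenOperator above simply forwards with U = V = Subspace.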


@@ -80,12 +80,12 @@ public:
 // Can be used to do I/O on the operator matrices externally
 void SetMatrix (int p,CoarseMatrix & A)
 {
-assert(A.size()==geom_srhs.npoint);
+GRID_ASSERT(A.size()==geom_srhs.npoint);
 GridtoBLAS(A[p],BLAS_A[p]);
 }
 void GetMatrix (int p,CoarseMatrix & A)
 {
-assert(A.size()==geom_srhs.npoint);
+GRID_ASSERT(A.size()==geom_srhs.npoint);
 BLAStoGrid(A[p],BLAS_A[p]);
 }
 void CopyMatrix (GeneralCoarseOp &_Op)
@@ -178,14 +178,14 @@ public:
 for(int32_t point = 0 ; point < geom.npoint; point++){
 int i=s*orhs*geom.npoint+point;
 int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
-assert(nbr<BLAS_B.size());
+GRID_ASSERT(nbr<BLAS_B.size());
 ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
 acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
 }
 j++;
 }
 }
-assert(j==unpadded_sites);
+GRID_ASSERT(j==unpadded_sites);
 }
 template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
 {
@@ -194,7 +194,7 @@ public:
 typedef typename vobj::vector_type vector_type;
 GridBase *Fg = from.Grid();
-assert(!Fg->_isCheckerBoarded);
+GRID_ASSERT(!Fg->_isCheckerBoarded);
 int nd = Fg->_ndimension;
 to.resize(Fg->lSites());
@@ -241,10 +241,10 @@ public:
 typedef typename vobj::vector_type vector_type;
 GridBase *Tg = grid.Grid();
-assert(!Tg->_isCheckerBoarded);
+GRID_ASSERT(!Tg->_isCheckerBoarded);
 int nd = Tg->_ndimension;
-assert(in.size()==Tg->lSites());
+GRID_ASSERT(in.size()==Tg->lSites());
 Coordinate LocalLatt = Tg->LocalDimensions();
 size_t nsite = 1;
@@ -669,7 +669,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
 const int Nsimd = CComplex::Nsimd();
 int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
-assert(nrhs>=1);
+GRID_ASSERT(nrhs>=1);
 RealD flops,bytes;
 int64_t osites=in.Grid()->oSites(); // unpadded
@@ -721,7 +721,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
 // std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
 // std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
 };
-virtual void Mdiag (const Field &in, Field &out){ assert(0);};
+virtual void Mdiag (const Field &in, Field &out){ GRID_ASSERT(0);};
 virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
 virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
 };


@@ -67,8 +67,8 @@ public:
 }
 int point(int dir, int disp) {
-assert(disp == -1 || disp == 0 || disp == 1);
-assert(base+0 <= dir && dir < base+4);
+GRID_ASSERT(disp == -1 || disp == 0 || disp == 1);
+GRID_ASSERT(base+0 <= dir && dir < base+4);
 // directions faster index = new indexing
 // 4d (base = 0):
@@ -131,7 +131,7 @@ public:
 return p;
 }
 }
-assert(0);
+GRID_ASSERT(0);
 return -1;
 }
 void BuildShifts(void)


@@ -54,7 +54,10 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -66,7 +69,7 @@ public:
} }
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);}; void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
@@ -100,7 +103,10 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -145,7 +151,10 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -165,19 +174,10 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef ACCELERATOR_CSHIFT template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
// Cshift on device template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using cshiftAllocator = devAllocator<T>; template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
#else template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
/* /*
template<class T> class vecView template<class T> class vecView
@@ -188,8 +188,9 @@ template<class T> class vecView
ViewMode mode; ViewMode mode;
void * cpu_ptr; void * cpu_ptr;
public: public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(std::vector<T> &refer_to_me,ViewMode _mode) vecView(Vector<T> &refer_to_me,ViewMode _mode)
{ {
cpu_ptr = &refer_to_me[0]; cpu_ptr = &refer_to_me[0];
size = refer_to_me.size(); size = refer_to_me.size();
@@ -205,22 +206,12 @@ template<class T> class vecView
} }
}; };
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode) template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
{ {
vecView<T> ret(vec,_mode); // does the open vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed return ret; // must be closed
} }
// Little autoscope assister
template<class View>
class VectorViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
VectorViewCloser(View &_v) : v(_v) {};
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
};
#define autoVecView(v_v,v,mode) \ #define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \ auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v); ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
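Note on the retired cshiftAllocator/commVector typedefs above: each remaining container now names its memory space explicitly instead of relying on a compile-time allocator switch. A minimal sketch of the intended split, assuming the typedefs above and the acceleratorCopyToDevice helper used elsewhere in this patch (the function name and sizes are illustrative, not Grid source):

    #include <Grid/Grid.h>
    using namespace Grid;

    void staging_sketch(void)
    {
      const size_t N = 1024;
      hostVector<double>   host_buf(N);  // aligned host memory; fill on the CPU
      deviceVector<double> dev_buf(N);   // device resident; the allocators' no-op
                                         // construct() shown above means resize
                                         // never touches device memory from host
      for(size_t i=0;i<N;i++) host_buf[i] = 1.0*i;
      // explicit staging replaces the old implicit cshiftAllocator choice
      acceleratorCopyToDevice(&host_buf[0],&dev_buf[0],N*sizeof(double));
    }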

View File

@@ -16,6 +16,44 @@ NAMESPACE_BEGIN(Grid);
 uint64_t total_shared;
 uint64_t total_device;
 uint64_t total_host;;
+#if defined(__has_feature)
+#if __has_feature(leak_sanitizer)
+#define ASAN_LEAK_CHECK
+#endif
+#endif
+#ifdef ASAN_LEAK_CHECK
+#include <sanitizer/asan_interface.h>
+#include <sanitizer/common_interface_defs.h>
+#include <sanitizer/lsan_interface.h>
+#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
+#else
+#define LEAK_CHECK(A) { }
+#endif
+void MemoryManager::DisplayMallinfo(void)
+{
+#ifdef __linux__
+  struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
+  mi = mallinfo();
+  std::cout << "MemoryManager: Total non-mmapped bytes (arena):       "<< (size_t)mi.arena<<std::endl;
+  std::cout << "MemoryManager: # of free chunks (ordblks):            "<< (size_t)mi.ordblks<<std::endl;
+  std::cout << "MemoryManager: # of free fastbin blocks (smblks):     "<< (size_t)mi.smblks<<std::endl;
+  std::cout << "MemoryManager: # of mapped regions (hblks):           "<< (size_t)mi.hblks<<std::endl;
+  std::cout << "MemoryManager: Bytes in mapped regions (hblkhd):      "<< (size_t)mi.hblkhd<<std::endl;
+  std::cout << "MemoryManager: Max. total allocated space (usmblks):  "<< (size_t)mi.usmblks<<std::endl;
+  std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
+  std::cout << "MemoryManager: Total allocated space (uordblks):      "<< (size_t)mi.uordblks<<std::endl;
+  std::cout << "MemoryManager: Total free space (fordblks):           "<< (size_t)mi.fordblks<<std::endl;
+  std::cout << "MemoryManager: Topmost releasable block (keepcost):   "<< (size_t)mi.keepcost<<std::endl;
+#endif
+  LEAK_CHECK();
+}
 void MemoryManager::PrintBytes(void)
 {
   std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@@ -35,7 +73,7 @@ void MemoryManager::PrintBytes(void)
 #ifdef GRID_CUDA
   cuda_mem();
 #endif
+  DisplayMallinfo();
 }
 uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
@@ -254,7 +292,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
 {
 #ifdef GRID_OMP
-  assert(omp_in_parallel()==0);
+  GRID_ASSERT(omp_in_parallel()==0);
 #endif
   if (ncache == 0) return ptr;
@@ -307,7 +345,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
 {
 #ifdef GRID_OMP
-  assert(omp_in_parallel()==0);
+  GRID_ASSERT(omp_in_parallel()==0);
 #endif
   for(int e=0;e<ncache;e++){
     if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
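One caveat on DisplayMallinfo above: glibc's mallinfo() reports every field as int, so the counters wrap once the arena passes 2 GB; mallinfo2() (glibc >= 2.33) widens them to size_t, which is why the source comment says "really want mallinfo2". A hedged sketch of a version-guarded variant (not part of the patch; the helper name is illustrative):

    #include <malloc.h>
    #include <iostream>

    void display_mallinfo_sketch(void)
    {
    #if defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 33))
      struct mallinfo2 mi = mallinfo2(); // size_t fields: no 2 GB wrap
    #else
      struct mallinfo  mi = mallinfo();  // int fields: may wrap on large arenas
    #endif
      std::cout << "arena  " << (size_t)mi.arena    << std::endl;
      std::cout << "in use " << (size_t)mi.uordblks << std::endl;
      std::cout << "free   " << (size_t)mi.fordblks << std::endl;
    }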

View File

@@ -211,6 +211,7 @@ private:
 #endif
 public:
+  static void DisplayMallinfo(void);
   static void NotifyDeletion(void * CpuPtr);
   static void Print(void);
   static void PrintAll(void);

View File

@@ -1,16 +1,15 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM
-#warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);
 #define MAXLINE 512
 static char print_buffer [ MAXLINE ];
-#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
+#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
 //#define dprintf(...)
+//#define mprintf(...)
 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@@ -51,12 +50,12 @@ int MemoryManager::EntryPresent(uint64_t CpuPtr)
 {
   if(AccViewTable.empty()) return 0;
-  auto count = AccViewTable.count(CpuPtr);  assert((count==0)||(count==1));
+  auto count = AccViewTable.count(CpuPtr);  GRID_ASSERT((count==0)||(count==1));
   return count;
 }
 void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
-  assert(!EntryPresent(CpuPtr));
+  GRID_ASSERT(!EntryPresent(CpuPtr));
   AcceleratorViewEntry AccCache;
   AccCache.CpuPtr = CpuPtr;
   AccCache.AccPtr = (uint64_t)NULL;
@@ -70,9 +69,9 @@ void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,View
 }
 MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
 {
-  assert(EntryPresent(CpuPtr));
+  GRID_ASSERT(EntryPresent(CpuPtr));
   auto AccCacheIterator = AccViewTable.find(CpuPtr);
-  assert(AccCacheIterator!=AccViewTable.end());
+  GRID_ASSERT(AccCacheIterator!=AccViewTable.end());
   return AccCacheIterator;
 }
 void MemoryManager::EntryErase(uint64_t CpuPtr)
@@ -82,7 +81,7 @@ void MemoryManager::EntryErase(uint64_t CpuPtr)
 }
 void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.LRU_valid==0);
+  GRID_ASSERT(AccCache.LRU_valid==0);
   if (AccCache.transient) {
     LRU.push_back(AccCache.CpuPtr);
     AccCache.LRU_entry = --LRU.end();
@@ -95,7 +94,7 @@ void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
 }
 void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.LRU_valid==1);
+  GRID_ASSERT(AccCache.LRU_valid==1);
   LRU.erase(AccCache.LRU_entry);
   AccCache.LRU_valid = 0;
   DeviceLRUBytes-=AccCache.bytes;
@@ -109,19 +108,19 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
   // Remove from Accelerator, remove entry, without flush
   // Cannot be locked. If allocated Must be in LRU pool.
   ///////////////////////////////////////////////////////////
-  assert(AccCache.state!=Empty);
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
-  assert(AccCache.accLock==0);
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.state!=Empty);
+  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
+  GRID_ASSERT(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr) {
     AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
     DeviceDestroy++;
     DeviceBytes   -=AccCache.bytes;
     LRUremove(AccCache);
     AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
   }
   uint64_t CpuPtr = AccCache.CpuPtr;
   EntryErase(CpuPtr);
@@ -139,9 +138,9 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
   // Take these OUT LRU queue when CPU locked?
   // Cannot take out the table as cpuLock data is important.
   ///////////////////////////////////////////////////////////////////////////
-  assert(AccCache.state!=Empty);
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  GRID_ASSERT(AccCache.state!=Empty);
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
          (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
         (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
   if (AccCache.accLock!=0) return;
@@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)NULL;
     AccCache.state=CpuDirty; // CPU primary now
     DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
   }
   //  uint64_t CpuPtr = AccCache.CpuPtr;
   DeviceEvictions++;
@@ -163,28 +162,30 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 }
 void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.state==AccDirty);
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.accLock==0);
-  assert(AccCache.AccPtr!=(uint64_t)NULL);
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.state==AccDirty);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.AccPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   DeviceToHostBytes+=AccCache.bytes;
   DeviceToHostXfer++;
   AccCache.state=Consistent;
 }
 void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.state==CpuDirty);
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.accLock==0);
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.state==CpuDirty);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr==(uint64_t)NULL){
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
   }
-  mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
         (uint64_t)AccCache.bytes,
         (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
   HostToDeviceBytes+=AccCache.bytes;
   HostToDeviceXfer++;
@@ -193,10 +194,10 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
 void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 {
-  assert(AccCache.state!=Empty);
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.accLock==0);
-  assert(AccCache.CpuPtr!=(uint64_t)NULL);
+  GRID_ASSERT(AccCache.state!=Empty);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr==(uint64_t)NULL){
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
@@ -210,33 +211,36 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
    AcceleratorViewClose((uint64_t)Ptr);
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     CpuViewClose((uint64_t)Ptr);
   } else {
-    assert(0);
+    GRID_ASSERT(0);
   }
 }
 void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
   uint64_t CpuPtr = (uint64_t)_CpuPtr;
   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
     return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
   } else {
-    assert(0);
+    GRID_ASSERT(0);
     return NULL;
   }
 }
 void  MemoryManager::EvictVictims(uint64_t bytes)
 {
-  assert(bytes<DeviceMaxBytes);
+  if(bytes>=DeviceMaxBytes) {
+    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
+  }
+  GRID_ASSERT(bytes<DeviceMaxBytes);
   while(bytes+DeviceLRUBytes > DeviceMaxBytes){
     if ( DeviceLRUBytes > 0){
-      assert(LRU.size()>0);
+      GRID_ASSERT(LRU.size()>0);
       uint64_t victim = LRU.back(); // From the LRU
       auto AccCacheIterator = EntryLookup(victim);
       auto & AccCache = AccCacheIterator->second;
@@ -260,19 +264,19 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   if (!AccCache.AccPtr) {
     EvictVictims(bytes);
   }
-  assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
-  assert(AccCache.cpuLock==0);  // Programming error
+  GRID_ASSERT((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
+  GRID_ASSERT(AccCache.cpuLock==0);  // Programming error
   if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
+    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
           (uint64_t)AccCache.CpuPtr,
           (uint64_t)CpuPtr,
           (uint64_t)AccCache.bytes,
           (uint64_t)bytes,
           (uint64_t)AccCache.accLock);
-    assert(AccCache.CpuPtr == CpuPtr);
-    assert(AccCache.bytes  ==bytes);
+    GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
+    GRID_ASSERT(AccCache.bytes  ==bytes);
   }
 /*
  *  State transitions and actions
@@ -289,7 +293,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  *  AccWrite   AccDirty   AccDirty    -        -
  */
   if(AccCache.state==Empty) {
-    assert(AccCache.LRU_valid==0);
+    GRID_ASSERT(AccCache.LRU_valid==0);
     AccCache.CpuPtr = CpuPtr;
     AccCache.AccPtr = (uint64_t)NULL;
     AccCache.bytes  = bytes;
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state  = Consistent; // Empty + AccRead => Consistent
     }
     AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
   } else if(AccCache.state==CpuDirty ){
     if(mode==AcceleratorWriteDiscard) {
       CpuDiscard(AccCache);
@@ -318,30 +322,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
     }
     AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
   } else if(AccCache.state==Consistent) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
       AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
     else
       AccCache.state  = Consistent; // Consistent + AccRead => Consistent
     AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
   } else if(AccCache.state==AccDirty) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
    else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
    AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
  } else {
-    assert(0);
+    GRID_ASSERT(0);
  }
-  assert(AccCache.accLock>0);
+  GRID_ASSERT(AccCache.accLock>0);
  // If view is opened on device must remove from LRU
  if(AccCache.LRU_valid==1){
    // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU \n");
+    dprintf("AccCache entry removed from LRU ");
    LRUremove(AccCache);
  }
@@ -358,16 +362,16 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;
-  assert(AccCache.cpuLock==0);
-  assert(AccCache.accLock>0);
+  GRID_ASSERT(AccCache.cpuLock==0);
+  GRID_ASSERT(AccCache.accLock>0);
   AccCache.accLock--;
   // Move to LRU queue if not locked and close on device
   if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
     LRUinsert(AccCache);
   } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
   }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -375,8 +379,8 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;
-  assert(AccCache.cpuLock>0);
-  assert(AccCache.accLock==0);
+  GRID_ASSERT(AccCache.cpuLock>0);
+  GRID_ASSERT(AccCache.accLock==0);
   AccCache.cpuLock--;
 }
@@ -409,12 +413,12 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
   //  EvictVictims(bytes);
   // }
-  assert((mode==CpuRead)||(mode==CpuWrite));
-  assert(AccCache.accLock==0);  // Programming error
+  GRID_ASSERT((mode==CpuRead)||(mode==CpuWrite));
+  GRID_ASSERT(AccCache.accLock==0);  // Programming error
   if(AccCache.state!=Empty) {
-    assert(AccCache.CpuPtr == CpuPtr);
-    assert(AccCache.bytes==bytes);
+    GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
+    GRID_ASSERT(AccCache.bytes==bytes);
   }
   if(AccCache.state==Empty) {
@@ -429,20 +433,20 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
     AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
     AccCache.cpuLock++;
   } else if(AccCache.state==Consistent) {
-    assert(AccCache.AccPtr != (uint64_t)NULL);
+    GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
     if(mode==CpuWrite)
       AccCache.state = CpuDirty;   // Consistent +CpuWrite => CpuDirty
     else
       AccCache.state = Consistent; // Consistent +CpuRead  => Consistent
     AccCache.cpuLock++;
   } else if(AccCache.state==AccDirty) {
-    assert(AccCache.AccPtr != (uint64_t)NULL);
+    GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
     Flush(AccCache);
     if(mode==CpuWrite) AccCache.state = CpuDirty;   // AccDirty +CpuWrite => CpuDirty, Flush
     else               AccCache.state = Consistent; // AccDirty +CpuRead  => Consistent, Flush
     AccCache.cpuLock++;
   } else {
-    assert(0); // should be unreachable
+    GRID_ASSERT(0); // should be unreachable
   }
   AccCache.transient= transient? EvictNext : 0;
@@ -524,12 +528,12 @@ void MemoryManager::Audit(std::string s)
   std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
   for(auto it=LRU.begin();it!=LRU.end();it++){
     uint64_t cpuPtr = *it;
-    assert(EntryPresent(cpuPtr));
+    GRID_ASSERT(EntryPresent(cpuPtr));
     auto AccCacheIterator = EntryLookup(cpuPtr);
     auto & AccCache = AccCacheIterator->second;
     LruBytes2+=AccCache.bytes;
-    assert(AccCache.LRU_valid==1);
-    assert(AccCache.LRU_entry==it);
+    GRID_ASSERT(AccCache.LRU_valid==1);
+    GRID_ASSERT(AccCache.LRU_entry==it);
   }
   std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
@@ -548,7 +552,7 @@ void MemoryManager::Audit(std::string s)
     if( AccCache.LRU_valid ) LruCnt++;
     if ( AccCache.cpuLock || AccCache.accLock ) {
-      assert(AccCache.LRU_valid==0);
+      GRID_ASSERT(AccCache.LRU_valid==0);
       std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
               << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
@@ -557,16 +561,16 @@ void MemoryManager::Audit(std::string s)
              << "\t LRUvalid  " << AccCache.LRU_valid<<std::endl;
     }
-    assert( AccCache.cpuLock== 0 ) ;
-    assert( AccCache.accLock== 0 ) ;
+    GRID_ASSERT( AccCache.cpuLock== 0 ) ;
+    GRID_ASSERT( AccCache.accLock== 0 ) ;
   }
   std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
-  assert(LruBytes1==LruBytes2);
-  assert(LruBytes1==DeviceLRUBytes);
+  GRID_ASSERT(LruBytes1==LruBytes2);
+  GRID_ASSERT(LruBytes1==DeviceLRUBytes);
   std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
-  assert(AccBytes==DeviceBytes);
+  GRID_ASSERT(AccBytes==DeviceBytes);
   std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
-  assert(LruCnt == LRU.size());
+  GRID_ASSERT(LruCnt == LRU.size());
   std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
 }
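For orientation, the coherence protocol all the GRID_ASSERT calls above are guarding, restated from the transition-table comment in the source as a freestanding sketch (the enum mirrors the source; the two helper functions are illustrative only, not Grid API):

    enum State { Empty, CpuDirty, Consistent, AccDirty };

    // Accelerator-side open: data moves toward the device (Clone) unless it
    // is already device-primary; a write always leaves the device copy dirty.
    State AccOpen(State s, bool write) {
      if (s == AccDirty) return AccDirty;   // device already primary
      return write ? AccDirty : Consistent; // Empty/CpuDirty => Clone first
    }

    // CPU-side open: a dirty device copy is flushed back first; a write
    // leaves the host dirty, a read keeps a consistent pair consistent.
    State CpuOpen(State s, bool write) {
      if (s == AccDirty) s = Consistent;    // Flush device -> host
      if (s == Empty || s == CpuDirty) return CpuDirty;
      return write ? CpuDirty : Consistent;
    }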

View File

@@ -10,16 +10,16 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
   int fd = open("/proc/self/pagemap", O_RDONLY);
-  assert(fd >= 0);
+  GRID_ASSERT(fd >= 0);
   const int page_size = 4096;
   uint64_t virt_pfn = (uint64_t)Buf / page_size;
   off_t offset = sizeof(uint64_t) * virt_pfn;
   uint64_t npages = (BYTES + page_size-1) / page_size;
-  uint64_t pagedata[npages];
+  std::vector<uint64_t> pagedata(npages);
   uint64_t ret = lseek(fd, offset, SEEK_SET);
-  assert(ret == offset);
-  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
-  assert(ret == sizeof(uint64_t) * npages);
+  GRID_ASSERT(ret == offset);
+  ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
+  GRID_ASSERT(ret == sizeof(uint64_t) * npages);
   int nhugepages = npages / 512;
   int n4ktotal, nnothuge;
   n4ktotal = 0;
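Worth spelling out the pagedata change above: uint64_t pagedata[npages] is a variable-length array on the stack, and npages scales with the buffer being checked, so probing a large allocation could blow the thread stack; std::vector moves the page table to the heap. Illustrative arithmetic (numbers are not from the source):

    // A 64 GB buffer: npages = 64*2^30 / 4096 = 16,777,216 pages,
    // i.e. a 128 MB VLA, far beyond a typical 8 MB thread stack.
    uint64_t BYTES  = 64ULL << 30;
    uint64_t npages = (BYTES + 4096 - 1) / 4096;
    std::vector<uint64_t> pagedata(npages);  // heap, not stack: safe at any size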

View File

@@ -82,6 +82,7 @@ public:
   bool _isCheckerBoarded;
   int        LocallyPeriodic;
   Coordinate _checker_dim_mask;
+  int        _checker_dim;
 public:
@@ -164,7 +165,7 @@ public:
     //
     if ( _simd_layout[dimension] > 2 ) {
       for(int d=0;d<_ndimension;d++){
-        if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
+        if ( d != dimension ) GRID_ASSERT ( (_simd_layout[d]==1)  );
       }
       permute_type = RotateBit; // How to specify distance; this is not just direction.
       return permute_type;
@@ -186,7 +187,7 @@ public:
   inline int64_t gSites(void)           const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
   inline int  Nd (void)                 const { return _ndimension;};
-  inline const Coordinate LocalStarts(void)             { return _lstart;    };
+  inline const Coordinate &LocalStarts(void)            { return _lstart;    };
   inline const Coordinate &FullDimensions(void)         { return _fdimensions;};
   inline const Coordinate &GlobalDimensions(void)       { return _gdimensions;};
   inline const Coordinate &LocalDimensions(void)        { return _ldimensions;};
@@ -215,11 +216,11 @@ public:
   // Global addressing
   ////////////////////////////////////////////////////////////////
   void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
-    assert(gidx< gSites());
+    GRID_ASSERT(gidx< gSites());
     Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
   }
   void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
-    assert(lidx<lSites());
+    GRID_ASSERT(lidx<lSites());
     Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
   }
   void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){

View File

@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
 public:
   int dummy;
-  Coordinate _checker_dim_mask;
+  //  Coordinate _checker_dim_mask;
   virtual int  CheckerBoardFromOindexTable (int Oindex) {
     return 0;
   }
@@ -106,6 +106,7 @@ public:
     _rdimensions.resize(_ndimension);
     _simd_layout.resize(_ndimension);
     _checker_dim_mask.resize(_ndimension);;
+    _checker_dim = -1;
     _lstart.resize(_ndimension);
     _lend.resize(_ndimension);
@@ -127,10 +128,10 @@ public:
       // Use a reduced simd grid
       _ldimensions[d] = _gdimensions[d] / _processors[d];  //local dimensions
       //std::cout << _ldimensions[d] << "  " << _gdimensions[d] << "  " << _processors[d] << std::endl;
-      assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+      GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
       _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
-      assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+      GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
       _lstart[d] = _processor_coor[d] * _ldimensions[d];
       _lend[d]   = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;

View File

@@ -57,16 +57,17 @@ class GridRedBlackCartesian : public GridBase
 {
 public:
   //  Coordinate _checker_dim_mask;
-  int              _checker_dim;
+  //  int              _checker_dim;
   std::vector<int> _checker_board;
+  virtual int isCheckerBoarded(void) const { return 1; };
   virtual int CheckerBoarded(int dim){
     if( dim==_checker_dim) return 1;
     else return 0;
   }
   virtual int CheckerBoard(const Coordinate &site){
     int linear=0;
-    assert(site.size()==_ndimension);
+    GRID_ASSERT(site.size()==_ndimension);
     for(int d=0;d<_ndimension;d++){
       if(_checker_dim_mask[d])
        linear=linear+site[d];
@@ -159,11 +160,11 @@ public:
     _isCheckerBoarded = true;
     _checker_dim = checker_dim;
-    assert(checker_dim_mask[checker_dim] == 1);
+    GRID_ASSERT(checker_dim_mask[checker_dim] == 1);
     _ndimension = dimensions.size();
-    assert(checker_dim_mask.size() == _ndimension);
-    assert(processor_grid.size() == _ndimension);
-    assert(simd_layout.size() == _ndimension);
+    GRID_ASSERT(checker_dim_mask.size() == _ndimension);
+    GRID_ASSERT(processor_grid.size() == _ndimension);
+    GRID_ASSERT(simd_layout.size() == _ndimension);
     _fdimensions.resize(_ndimension);
     _gdimensions.resize(_ndimension);
@@ -189,20 +190,20 @@ public:
       if (d == _checker_dim)
       {
-        assert((_gdimensions[d] & 0x1) == 0);
+        GRID_ASSERT((_gdimensions[d] & 0x1) == 0);
        _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
        _gsites /= 2;
       }
       _ldimensions[d] = _gdimensions[d] / _processors[d];
-      assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+      GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
       _lstart[d] = _processor_coor[d] * _ldimensions[d];
       _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
       // Use a reduced simd grid
       _simd_layout[d] = simd_layout[d];
       _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
-      assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
-      assert(_rdimensions[d] > 0);
+      GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+      GRID_ASSERT(_rdimensions[d] > 0);
       // all elements of a simd vector must have same checkerboard.
       // If Ls vectorised, this must still be the case; e.g. dwf rb5d
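For orientation, the parity convention the red-black grid asserts protect: a site's checkerboard is the sum of its coordinates over the masked dimensions, taken mod 2. A freestanding restatement of the CheckerBoard logic above (the helper name is illustrative, not Grid API):

    #include <vector>

    int parity(const std::vector<int> &site, const std::vector<int> &mask)
    {
      int linear = 0;
      for (size_t d = 0; d < site.size(); d++)
        if (mask[d]) linear += site[d];
      return linear & 0x1;   // 0 and 1 label the two checkerboards
    }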

View File

@@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void)        { return
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(ComplexF &c)
+{
+  GlobalSumP2P(c);
+}
+void CartesianCommunicator::GlobalSum(ComplexD &c)
+{
+  GlobalSumP2P(c);
+}
+#else
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
   GlobalSumVector((float *)&c,2);
 }
-void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
-{
-  GlobalSumVector((float *)c,2*N);
-}
 void CartesianCommunicator::GlobalSum(ComplexD &c)
 {
   GlobalSumVector((double *)&c,2);
 }
+#endif
+void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
+{
+  GlobalSumVector((float *)c,2*N);
+}
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
   GlobalSumVector((double *)c,2*N);

View File

@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>
+#define NVLINK_GET
 NAMESPACE_BEGIN(Grid);
 extern bool Stencil_force_mpi ;
@@ -106,7 +108,7 @@ public:
   // very VERY rarely (Log, serial RNG) we need world without a grid
   ////////////////////////////////////////////////////////////////////////////////
   static int  RankWorld(void) ;
-  static void BroadcastWorld(int root,void* data, int bytes);
+  static void BroadcastWorld(int root,void* data, uint64_t bytes);
   static void BarrierWorld(void);
   ////////////////////////////////////////////////////////////
@@ -128,6 +130,35 @@ public:
   void GlobalXOR(uint32_t &);
   void GlobalXOR(uint64_t &);
+  template<class obj> void GlobalSumP2P(obj &o)
+  {
+    std::vector<obj> column;
+    obj accum = o;
+    int source,dest;
+    for(int d=0;d<_ndimension;d++){
+      column.resize(_processors[d]);
+      column[0] = accum;
+      std::vector<MpiCommsRequest_t> list;
+      for(int p=1;p<_processors[d];p++){
+        ShiftedRanks(d,p,source,dest);
+        SendToRecvFromBegin(list,
+                            &column[0],
+                            dest,
+                            &column[p],
+                            source,
+                            sizeof(obj),d*100+p);
+      }
+      if (!list.empty()) // avoid triggering GRID_ASSERT in comms == none
+        CommsComplete(list);
+      for(int p=1;p<_processors[d];p++){
+        accum = accum + column[p];
+      }
+    }
+    Broadcast(0,accum);
+    o=accum;
+  }
   template<class obj> void GlobalSum(obj &o){
     typedef typename obj::scalar_type scalar_type;
     int words = sizeof(obj)/sizeof(scalar_type);
@@ -138,32 +169,44 @@ public:
   ////////////////////////////////////////////////////////////
   // Face exchange, buffer swap in translational invariant way
   ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  void CommsComplete(std::vector<MpiCommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
                            void *xmit,
                            int dest,
                            void *recv,
                            int from,
-                           int bytes,int dir);
+                           uint64_t bytes,int dir);
   void SendToRecvFrom(void *xmit,
                       int xmit_to_rank,
                       void *recv,
                       int recv_from_rank,
-                      int bytes);
+                      uint64_t bytes);
+  int IsOffNode(int rank);
   double StencilSendToRecvFrom(void *xmit,
                                int xmit_to_rank,int do_xmit,
                                void *recv,
                                int recv_from_rank,int do_recv,
-                               int bytes,int dir);
+                               uint64_t bytes,int dir);
-  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
                                       void *xmit,
                                       int xmit_to_rank,int do_xmit,
                                       void *recv,
                                       int recv_from_rank,int do_recv,
-                                      int xbytes,int rbytes,int dir);
+                                      uint64_t xbytes,uint64_t rbytes,int dir);
+  // Could do a PollHtoD and have a CommsMerge dependence
+  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
+  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
+  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                    void *xmit,void *xmit_comp,
+                                    int xmit_to_rank,int do_xmit,
+                                    void *recv,void *recv_comp,
+                                    int recv_from_rank,int do_recv,
+                                    uint64_t xbytes,uint64_t rbytes,int dir);
   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@@ -177,20 +220,20 @@ public:
   ////////////////////////////////////////////////////////////
   // Broadcast a buffer and composite larger
   ////////////////////////////////////////////////////////////
-  void Broadcast(int root,void* data, int bytes);
+  void Broadcast(int root,void* data, uint64_t bytes);
   ////////////////////////////////////////////////////////////
   // All2All down one dimension
   ////////////////////////////////////////////////////////////
   template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
-    assert(dim>=0);
-    assert(dim<_ndimension);
-    assert(in.size()==out.size());
+    GRID_ASSERT(dim>=0);
+    GRID_ASSERT(dim<_ndimension);
+    GRID_ASSERT(in.size()==out.size());
     int numnode = _processors[dim];
     uint64_t bytes=sizeof(T);
    uint64_t words=in.size()/numnode;
-    assert(numnode * words == in.size());
-    assert(words < (1ULL<<31));
+    GRID_ASSERT(numnode * words == in.size());
+    GRID_ASSERT(words < (1ULL<<31));
     AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
   }
   void AllToAll(int dim  ,void *in,void *out,uint64_t words,uint64_t bytes);
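The GlobalSumP2P template above replaces a single MPI_Allreduce with explicit point-to-point exchanges, one Cartesian dimension at a time, then broadcasts rank 0's accumulator so every rank finishes bitwise identical. A standalone sketch of the same idea over one dimension in plain MPI (the function name and single-communicator setting are illustrative, not Grid API):

    #include <mpi.h>
    #include <vector>

    double ring_global_sum(double local, MPI_Comm comm)
    {
      int rank, size;
      MPI_Comm_rank(comm, &rank);
      MPI_Comm_size(comm, &size);
      std::vector<double> column(size, 0.0);
      column[0] = local;
      for (int p = 1; p < size; p++) {
        int dest   = (rank + p) % size;        // neighbour p steps ahead
        int source = (rank - p + size) % size; // neighbour p steps behind
        MPI_Sendrecv(&column[0], 1, MPI_DOUBLE, dest,   p,
                     &column[p], 1, MPI_DOUBLE, source, p,
                     comm, MPI_STATUS_IGNORE);
      }
      double accum = 0.0;
      for (int p = 0; p < size; p++) accum += column[p]; // fixed order: reproducible
      MPI_Bcast(&accum, 1, MPI_DOUBLE, 0, comm);         // bitwise-identical result
      return accum;
    }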

View File

@@ -28,9 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 #include <Grid/communicator/SharedMemory.h>
+void GridAbort(void) { MPI_Abort(MPI_COMM_WORLD,SIGABRT); }
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
 NAMESPACE_BEGIN(Grid);
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
+#ifdef GRID_CHECKSUM_COMMS
+uint64_t checksum_index = 1;
+#endif
 ////////////////////////////////////////////
 // First initialise of comms system
@@ -55,11 +63,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
 #endif
   //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
   if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
-    assert(0);
+    GRID_ASSERT(0);
   }
   if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
-    assert(0);
+    GRID_ASSERT(0);
   }
 }
@@ -80,20 +88,20 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
 {
   int rank;
   int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
   return rank;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
 {
   coor.resize(_ndimension);
   int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -120,8 +128,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 //////////////////////////////////
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
 {
-  _ndimension = processors.size();  assert(_ndimension>=1);
-  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
+  _ndimension = processors.size();  GRID_ASSERT(_ndimension>=1);
+  int parent_ndimension = parent._ndimension; GRID_ASSERT(_ndimension >= parent._ndimension);
   Coordinate parent_processor_coor(_ndimension,0);
   Coordinate parent_processors    (_ndimension,1);
   Coordinate shm_processors       (_ndimension,1);
@@ -145,7 +153,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
     childsize *= processors[d];
   }
   int Nchild = Nparent/childsize;
-  assert (childsize * Nchild == Nparent);
+  GRID_ASSERT (childsize * Nchild == Nparent);
   Coordinate ccoor(_ndimension); // coor within subcommunicator
   Coordinate scoor(_ndimension); // coor of split within parent
@@ -171,12 +179,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
     // Split the communicator
     ////////////////////////////////////////////////////////////////
     int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
-    assert(ierr==0);
+    GRID_ASSERT(ierr==0);
   } else {
     srank = 0;
     int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
-    assert(ierr==0);
+    GRID_ASSERT(ierr==0);
   }
   //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -201,7 +209,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
     }
   }
   for(int d=0;d<processors.size();d++){
-    assert(_processor_coor[d] == ccoor[d] );
+    GRID_ASSERT(_processor_coor[d] == ccoor[d] );
   }
 }
@@ -243,7 +251,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
   for(int i=0;i<_ndimension*2;i++){
     MPI_Comm_dup(communicator,&communicator_halo[i]);
   }
-  assert(Size==_Nprocessors);
+  GRID_ASSERT(Size==_Nprocessors);
 }
 CartesianCommunicator::~CartesianCommunicator()
@@ -257,82 +265,103 @@ CartesianCommunicator::~CartesianCommunicator()
     }
   }
 }
-void CartesianCommunicator::GlobalSum(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint32_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalXOR(uint64_t &u){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalMax(float &f)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalMax(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSumVector(float *f,int N)
-{
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
+#ifdef USE_GRID_REDUCTION
+void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("GlobalSumP2P");
+  CartesianCommunicator::GlobalSumP2P(f);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  FlightRecorder::StepLog("GlobalSumP2P");
+  CartesianCommunicator::GlobalSumP2P(d);
+}
+#else
+void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("AllReduce float");
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  GRID_ASSERT(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  FlightRecorder::StepLog("AllReduce double");
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  GRID_ASSERT(ierr==0);
+}
+#endif
+void CartesianCommunicator::GlobalSum(uint32_t &u){
+  FlightRecorder::StepLog("AllReduce uint32_t");
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
+  GRID_ASSERT(ierr==0);
+}
+void CartesianCommunicator::GlobalSum(uint64_t &u){
+  FlightRecorder::StepLog("AllReduce uint64_t");
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
+  GRID_ASSERT(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
+  FlightRecorder::StepLog("AllReduceVector");
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
+  GRID_ASSERT(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint32_t &u){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
+  GRID_ASSERT(ierr==0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  FlightRecorder::StepLog("GlobalXOR");
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
+  GRID_ASSERT(ierr==0);
+}
+void CartesianCommunicator::GlobalMax(float &f)
+{
+  FlightRecorder::StepLog("GlobalMax");
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
+  GRID_ASSERT(ierr==0);
+}
+void CartesianCommunicator::GlobalMax(double &d)
+{
+  FlightRecorder::StepLog("GlobalMax");
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
+  GRID_ASSERT(ierr==0);
+}
+void CartesianCommunicator::GlobalSumVector(float *f,int N)
+{
+  FlightRecorder::StepLog("GlobalSumVector(float *)");
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
+  GRID_ASSERT(ierr==0);
+}
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
+  FlightRecorder::StepLog("GlobalSumVector(double *)");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
                                                 void *xmit,
                                                 int dest,
                                                 void *recv,
                                                 int from,
-                                                int bytes,int dir)
+                                                uint64_t bytes,int dir)
 {
   MPI_Request xrq;
   MPI_Request rrq;
-  assert(dest != _processor);
-  assert(from != _processor);
+  GRID_ASSERT(dest != _processor);
+  GRID_ASSERT(from != _processor);
   int tag;
   tag= dir+from*32;
-  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
-  assert(ierr==0);
+  int ierr=MPI_Irecv(recv,(int)( bytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator,&rrq);
+  GRID_ASSERT(ierr==0);
   list.push_back(rrq);
   tag= dir+_processor*32;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
-  assert(ierr==0);
+  ierr =MPI_Isend(xmit,(int)(bytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator,&xrq);
+  GRID_ASSERT(ierr==0);
   list.push_back(xrq);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
+void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
 {
   int nreq=list.size();
@@ -340,7 +369,7 @@ void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
   std::vector<MPI_Status> status(nreq);
   int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
   list.resize(0);
 }
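A note on the MPI_CHAR to MPI_INT32_T switch running through these hunks: MPI count arguments are plain int, so a byte-typed message tops out at 2^31-1 bytes, while counting 32-bit words raises the per-message ceiling to roughly 8 GiB, at the price of requiring every transfer size to be a multiple of sizeof(int32_t). A sketch of the implied conversion (the helper name is illustrative, not Grid API):

    #include <cstdint>
    #include <cassert>

    static inline int word_count(uint64_t bytes)
    {
      assert(bytes % sizeof(int32_t) == 0); // Grid's buffers satisfy this
      uint64_t words = bytes / sizeof(int32_t);
      assert(words <= 0x7fffffffULL);       // must still fit MPI's int count
      return (int)words;
    }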
@@ -349,50 +378,63 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest, int dest,
void *recv, void *recv,
int from, int from,
int bytes) uint64_t bytes)
{ {
std::vector<CommsRequest_t> reqs(0); std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
// Enforce no UVM in comms, device or host OK // Enforce no UVM in comms, device or host OK
assert(acceleratorIsCommunicable(xmit)); GRID_ASSERT(acceleratorIsCommunicable(xmit));
assert(acceleratorIsCommunicable(recv)); GRID_ASSERT(acceleratorIsCommunicable(recv));
// Give the CPU to MPI immediately; can use threads to overlap optionally // Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, ierr=MPI_Sendrecv(xmit,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,dest,myrank,
recv,bytes,MPI_CHAR,from, from, recv,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,from, from,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
assert(ierr==0); GRID_ASSERT(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
} }
// Basic Halo comms primitive // Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dox, int dest, int dox,
void *recv, void *recv,
int from, int dor, int from, int dor,
int bytes,int dir) uint64_t bytes,int dir)
{ {
std::vector<CommsRequest_t> list; std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir); StencilSendToRecvFromComplete(list,dir);
return offbytes; return offbytes;
} }
int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}
#undef NVLINK_GET // Define to use get instead of put DMA #ifdef ACCELERATOR_AWARE_MPI
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest,int dox, int dest,int dox,
void *recv, void *recv,
int from,int dor, int from,int dor,
int xbytes,int rbytes,int dir) uint64_t xbytes,uint64_t rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,void *recv_comp,
int from,int dor,
uint64_t xbytes,uint64_t rbytes,int dir)
{ {
int ncomm =communicator_halo.size(); int ncomm =communicator_halo.size();
int commdir=dir%ncomm; int commdir=dir%ncomm;
@@ -405,62 +447,431 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   int gfrom = ShmRanks[from];
   int gme   = ShmRanks[_processor];
-  assert(dest != _processor);
-  assert(from != _processor);
-  assert(gme == ShmRank);
+  GRID_ASSERT(dest != _processor);
+  GRID_ASSERT(from != _processor);
+  GRID_ASSERT(gme == ShmRank);
   double off_node_bytes=0.0;
   int tag;
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
+      //      std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
+      ierr=MPI_Irecv(recv_comp,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
+      GRID_ASSERT(ierr==0);
       list.push_back(rrq);
       off_node_bytes+=rbytes;
     }
 #ifdef NVLINK_GET
+    else {
       void *shm = (void *) this->ShmBufferTranslate(from,xmit);
-      assert(shm!=NULL);
+      GRID_ASSERT(shm!=NULL);
+      //      std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+    }
 #endif
   }
+  // This is a NVLINK PUT
   if (dox) {
-    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      assert(ierr==0);
+      ierr =MPI_Isend(xmit_comp,(int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
+      GRID_ASSERT(ierr==0);
       list.push_back(xrq);
       off_node_bytes+=xbytes;
     } else {
 #ifndef NVLINK_GET
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
+      GRID_ASSERT(shm!=NULL);
       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
 #endif
+    }
+  }
+  return off_node_bytes;
+}
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
+{
+  int nreq=list.size();
+  /*finishes Get/Put*/
+  acceleratorCopySynchronise();
+  if (nreq==0) return;
+  std::vector<MPI_Status> status(nreq);
+  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
+  GRID_ASSERT(ierr==0);
+  list.resize(0);
+  this->StencilBarrier();
+}
+#else /* NOT ... ACCELERATOR_AWARE_MPI */
+///////////////////////////////////////////
+// Pipeline mode through host memory
+///////////////////////////////////////////
+/*
+ * In prepare (phase 1):
+ * PHASE 1: (prepare)
+ * - post MPI receive buffers asynch
+ * - post device - host send buffer transfer asynch
+ * PHASE 2: (Begin)
+ * - complete all copies
+ * - post MPI send asynch
+ * - post device - device transfers
+ * PHASE 3: (Complete)
+ * - MPI_waitall
+ * - host-device transfers
+ *
+ *********************************
+ * NB could split this further:
+ *--------------------------------
+ * PHASE 1: (Prepare)
+ * - post MPI receive buffers asynch
+ * - post device - host send buffer transfer asynch
+ * PHASE 2: (BeginInterNode)
+ * - complete all copies
+ * - post MPI send asynch
+ * PHASE 3: (BeginIntraNode)
+ * - post device - device transfers
+ * PHASE 4: (Complete)
+ * - MPI_waitall
+ * - host-device transfers asynch
+ * - (complete all copies)
+ */
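Read as a whole, the comment above describes a small pipeline driven from the stencil code. One plausible way a caller could sequence the five new methods is sketched below; this is illustrative only (Grid's Stencil class owns the real sequencing, and the driver name is invented):

// Illustrative driver only, assuming a non-accelerator-aware-MPI build.
#include <Grid/Grid.h>
using namespace Grid;

void HaloExchangeSketch(CartesianCommunicator &comm,
                        std::vector<CommsRequest_t> &list,
                        void *xmit, void *recv,
                        int dest, int from, uint64_t bytes, int dir)
{
  // Phase 1: post host-side receives, start device->host copies of send data
  comm.StencilSendToRecvFromPrepare (list,xmit,dest,1,recv,from,1,bytes,bytes,dir);
  // Overlap: as each DtoH copy lands, the matching MPI_Isend is issued
  comm.StencilSendToRecvFromPollDtoH(list);
  // Phase 2: queue the intra-node device-device transfers
  comm.StencilSendToRecvFromBegin   (list,xmit,xmit,dest,1,recv,recv,from,1,bytes,bytes,dir);
  // Drain completed receives back to device memory
  comm.StencilSendToRecvFromPollIRecv(list);
  // Phase 3: wait for sends, verify checksums, free host staging buffers
  comm.StencilSendToRecvFromComplete(list,dir);
}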
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
+							   void *xmit,
+							   int dest,int dox,
+							   void *recv,
+							   int from,int dor,
+							   uint64_t xbytes,uint64_t rbytes,int dir)
+{
+/*
+ * Bring sequence from Stencil.h down to lower level.
+ * Assume using XeLink is ok
+ */
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+  MPI_Request xrq;
+  MPI_Request rrq;
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+  GRID_ASSERT(dest != _processor);
+  GRID_ASSERT(from != _processor);
+  GRID_ASSERT(gme == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+  void * host_recv = NULL;
+  void * host_xmit = NULL;
+/*
+ * PHASE 1: (Prepare)
+ * - post MPI receive buffers asynch
+ * - post device - host send buffer transfer asynch
+ */
+#ifdef GRID_CHECKSUM_COMMS
+  rbytes += 8;
+  xbytes += 8;
+#endif
+  if ( dor ) {
+    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+from*32;
+      host_recv = this->HostBufferMalloc(rbytes);
+      ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
+      GRID_ASSERT(ierr==0);
+      CommsRequest_t srq;
+      srq.PacketType = InterNodeRecv;
+      srq.bytes      = rbytes;
+      srq.req        = rrq;
+      srq.host_buf   = host_recv;
+      srq.device_buf = recv;
+      srq.tag        = tag;
+      list.push_back(srq);
+      off_node_bytes+=rbytes;
+    }
+  }
+  if (dox) {
+    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+      tag= dir+_processor*32;
+      host_xmit = this->HostBufferMalloc(xbytes);
+      CommsRequest_t srq;
+#ifdef GRID_CHECKSUM_COMMS
+      uint64_t xbytes_data = xbytes - 8;
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
+      GRID_ASSERT(xbytes % 8 == 0);
+      // flip one bit so that a zero buffer is not consistent
+      uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
+      *(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
+#else
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+#endif
+      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      //      GRID_ASSERT(ierr==0);
+      //      off_node_bytes+=xbytes;
+      srq.PacketType = InterNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = host_xmit;
+      srq.device_buf = xmit;
+      srq.tag        = tag;
+      srq.dest       = dest;
+      srq.commdir    = commdir;
+      list.push_back(srq);
     }
   }
   return off_node_bytes;
 }
+/*
+ * In the interest of better pipelining, poll for completion on each DtoH and
+ * start MPI_ISend in the meantime
+ */
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
+{
+  int pending = 0;
+  do {
+    pending = 0;
+    for(int idx = 0; idx<list.size();idx++){
+      if ( list[idx].PacketType==InterNodeRecv ) {
+	int flag = 0;
+	MPI_Status status;
+	int ierr = MPI_Test(&list[idx].req,&flag,&status);
+	assert(ierr==0);
+	if ( flag ) {
+	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
+#ifdef GRID_CHECKSUM_COMMS
+	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
+#else
+	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
+#endif
+	  list[idx].PacketType=InterNodeReceiveHtoD;
+	} else {
+	  pending ++;
+	}
+      }
+    }
+    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
+  } while ( pending );
+}
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
+{
+  int pending = 0;
+  do {
+    pending = 0;
+    for(int idx = 0; idx<list.size();idx++){
+      if ( list[idx].PacketType==InterNodeXmit ) {
+	if ( acceleratorEventIsComplete(list[idx].ev) ) {
+	  void *host_xmit = list[idx].host_buf;
+	  uint64_t xbytes = list[idx].bytes;
+	  int dest        = list[idx].dest;
+	  int tag         = list[idx].tag;
+	  int commdir     = list[idx].commdir;
+	  ///////////////////
+	  // Send packet
+	  ///////////////////
+	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
+	  MPI_Request xrq;
+	  int ierr =MPI_Isend(host_xmit, (int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
+	  GRID_ASSERT(ierr==0);
+	  list[idx].req = xrq; // Update the MPI request in the list
+	  list[idx].PacketType=InterNodeXmitISend;
+	} else {
+	  // not done, so return to polling loop
+	  pending++;
+	}
+      }
+    }
+  } while (pending);
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,void *xmit_comp,
+							 int dest,int dox,
+							 void *recv,void *recv_comp,
+							 int from,int dor,
+							 uint64_t xbytes,uint64_t rbytes,int dir)
+{
+  int ncomm  =communicator_halo.size();
+  int commdir=dir%ncomm;
+  MPI_Request xrq;
+  MPI_Request rrq;
+  int ierr;
+  int gdest = ShmRanks[dest];
+  int gfrom = ShmRanks[from];
+  int gme   = ShmRanks[_processor];
+  GRID_ASSERT(dest != _processor);
+  GRID_ASSERT(from != _processor);
+  GRID_ASSERT(gme == ShmRank);
+  double off_node_bytes=0.0;
+  int tag;
+  void * host_xmit = NULL;
+  ////////////////////////////////
+  // Receives already posted
+  // Copies already started
+  ////////////////////////////////
+  /*
+   * PHASE 2: (Begin)
+   * - complete all copies
+   * - post MPI send asynch
+   */
+#ifdef NVLINK_GET
+  if ( dor ) {
+    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
+      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+      GRID_ASSERT(shm!=NULL);
+      CommsRequest_t srq;
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+      srq.PacketType = IntraNodeRecv;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
+    }
+  }
+#else
+  if (dox) {
+    if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
+      // Intranode
+      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+      GRID_ASSERT(shm!=NULL);
+      CommsRequest_t srq;
+      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+      srq.PacketType = IntraNodeXmit;
+      srq.bytes      = xbytes;
+      //      srq.req        = xrq;
+      srq.host_buf   = NULL;
+      srq.device_buf = xmit;
+      srq.tag        = -1;
+      srq.dest       = dest;
+      srq.commdir    = dir;
+      list.push_back(srq);
+    }
+  }
+#endif
+  return off_node_bytes;
+}
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-  int nreq=list.size();
-  acceleratorCopySynchronise();
-  if (nreq==0) return;
-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-  list.resize(0);
+  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
+  std::vector<MPI_Status>  status;
+  std::vector<MPI_Request> MpiRequests;
+  for(int r=0;r<list.size();r++){
+    // Must check each Send buf is clear to reuse
+    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
+    //    if ( list[r].PacketType == InterNodeRecv   ) MpiRequests.push_back(list[r].req); // Already "Test" passed
   }
+  int nreq=MpiRequests.size();
+  if (nreq>0) {
+    status.resize(MpiRequests.size());
+    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+    GRID_ASSERT(ierr==0);
+  }
+  //  for(int r=0;r<nreq;r++){
+  //    if ( list[r].PacketType==InterNodeRecv ) {
+  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
+  //    }
+  //  }
+#ifdef GRID_CHECKSUM_COMMS
+  for(int r=0;r<list.size();r++){
+    if ( list[r].PacketType == InterNodeReceiveHtoD ) {
+      uint64_t rbytes_data = list[r].bytes - 8;
+      uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
+      uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag);
+      if (expected_cs != computed_cs) {
+	// TODO: error message, backtrace, quit
+	fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
+	fprintf(stderr, "  processor = %d\n", (int)_processor);
+	for(int d=0;d<_processors.size();d++)
+	  fprintf(stderr, "  processor_coord[%d] = %d\n", d, _processor_coor[d]);
+	fprintf(stderr, "  hostname: %s\n", GridHostname());
+	fprintf(stderr, "  expected_cs: %ld\n", expected_cs);
+	fprintf(stderr, "  computed_cs: %ld\n", computed_cs);
+	fprintf(stderr, "  dest: %d\n", list[r].dest);
+	fprintf(stderr, "  tag: %d\n", list[r].tag);
+	fprintf(stderr, "  commdir: %d\n", list[r].commdir);
+	fprintf(stderr, "  bytes: %ld\n", (uint64_t)list[r].bytes);
+	fflush(stderr);
+	// backtrace
+	int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
+	backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
+	exit(1);
+      }
+    }
+  }
+  checksum_index += 1;
+#endif
+  list.resize(0);            // Delete the list
+  this->HostBufferFreeAll(); // Clean up the buffer allocs
+#ifndef NVLINK_GET
+  this->StencilBarrier();    // if PUT must check our nbrs have filled our receive buffers.
+#endif
+}
+#endif
+////////////////////////////////////////////
+// END PIPELINE MODE / NO CUDA AWARE MPI
+////////////////////////////////////////////
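The checksum scheme above appends one spare 64-bit word to each staged send buffer and salts it with the exchange index and MPI tag, so a stale or misrouted packet (right data, wrong slot) also trips the comparison. A minimal host-side model of the salting logic follows; checksum_gpu in Grid is a device-side reduction, and the XOR-fold below is only an illustrative stand-in, not the library's kernel:

#include <cstdint>
#include <cstddef>

// Illustrative stand-in for Grid's device-side checksum_gpu reduction.
static uint64_t checksum_fold(const uint64_t *words, size_t nwords) {
  uint64_t sum = 0;
  for (size_t i = 0; i < nwords; i++) sum ^= words[i] + 0x9e3779b97f4a7c15ULL * i;
  return sum;
}

// Sender writes salted_cs(...) into the trailing word; the receiver recomputes
// it over the device buffer after the HtoD copy and compares.
uint64_t salted_cs(const uint64_t *buf, size_t nwords,
                   uint64_t checksum_index, int tag) {
  return checksum_fold(buf, nwords) ^ (checksum_index + 1 + 1000 * (uint64_t)tag);
}

A mismatch therefore indicts either the MPI transport or one of the staging copies, which is exactly the blame report the error branch prints before backtracing and exiting.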
 void CartesianCommunicator::StencilBarrier(void)
 {
+  FlightRecorder::StepLog("NodeBarrier");
   MPI_Barrier  (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -468,17 +879,19 @@ void CartesianCommunicator::StencilBarrier(void)
 //}
 void CartesianCommunicator::Barrier(void)
 {
+  FlightRecorder::StepLog("GridBarrier");
   int ierr = MPI_Barrier(communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+void CartesianCommunicator::Broadcast(int root,void* data,uint64_t bytes)
 {
+  FlightRecorder::StepLog("Broadcast");
   int ierr=MPI_Bcast(data,
-		     bytes,
+		     (int)bytes,
 		     MPI_BYTE,
 		     root,
 		     communicator);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 int CartesianCommunicator::RankWorld(void){
   int r;
@@ -486,23 +899,25 @@ int CartesianCommunicator::RankWorld(void){
   return r;
 }
 void CartesianCommunicator::BarrierWorld(void){
+  FlightRecorder::StepLog("BarrierWorld");
   int ierr = MPI_Barrier(communicator_world);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes)
 {
+  FlightRecorder::StepLog("BroadcastWorld");
   int ierr= MPI_Bcast(data,
-		      bytes,
+		      (int)bytes,
 		      MPI_BYTE,
 		      root,
 		      communicator_world);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
 {
   Coordinate row(_ndimension,1);
-  assert(dim>=0 && dim<_ndimension);
+  GRID_ASSERT(dim>=0 && dim<_ndimension);
   //  Split the communicator
   row[dim] = _processors[dim];
@@ -513,6 +928,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
 }
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
+  FlightRecorder::StepLog("AllToAll");
   // MPI is a pain and uses "int" arguments
   // 64*64*64*128*16 == 500Million elements of data.
  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
@@ -522,8 +938,8 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t
   int ibytes;
   iwords = words;
   ibytes = bytes;
-  assert(words == iwords); // safe to cast to int ?
-  assert(bytes == ibytes); // safe to cast to int ?
+  GRID_ASSERT(words == iwords); // safe to cast to int ?
+  GRID_ASSERT(bytes == ibytes); // safe to cast to int ?
   MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
   MPI_Type_commit(&object);
   MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
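The comments above explain why AllToAll wraps each logical element in a contiguous MPI datatype: MPI_Alltoall takes int counts, so counting in whole elements instead of bytes keeps the count small even when the payload is tens of gigabytes. A hedged illustration of the arithmetic from the comment (values taken from the "64*64*64*128*16" example, the program itself is not part of the patch):

#include <cstdint>
#include <cassert>
#include <climits>

int main() {
  uint64_t words = 64ULL*64*64*128*16;  // elements per rank, ~5.4e8
  uint64_t bytes = 24*4;                // bytes per element
  // Counted in bytes the payload is ~5.2e10, far beyond INT_MAX ...
  assert(words * bytes > (uint64_t)INT_MAX);
  // ... but both MPI arguments stay inside int range when one
  // MPI_Type_contiguous(bytes, MPI_BYTE, ...) datatype covers each element.
  assert(words <= (uint64_t)INT_MAX && bytes <= (uint64_t)INT_MAX);
  return 0;
}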

View File

@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
+void GridAbort(void) { abort(); }
 void CartesianCommunicator::Init(int *argc, char *** arv)
 {
   GlobalSharedMemory::Init(communicator_world);
@@ -54,14 +56,14 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
   _shm_processors = Coordinate(processors.size(),1);
   _processors = processors;
-  _ndimension = processors.size();  assert(_ndimension>=1);
+  _ndimension = processors.size();  GRID_ASSERT(_ndimension>=1);
   _processor_coor.resize(_ndimension);
   // Require 1^N processor grid for fake
   _Nprocessors=1;
   _processor = 0;
   for(int d=0;d<_ndimension;d++) {
-    assert(_processors[d]==1);
+    GRID_ASSERT(_processors[d]==1);
     _processor_coor[d] = 0;
   }
   SetCommunicator(communicator_world);
@@ -87,19 +89,19 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
 					   void *recv,
 					   int from,
-					   int bytes)
+					   uint64_t bytes)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ GRID_ASSERT(list.size()==0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
-						int bytes,int dir)
+						uint64_t bytes,int dir)
 {
-  assert(0);
+  GRID_ASSERT(0);
 }
 void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
@@ -113,8 +115,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
 int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
-void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
-void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
+void CartesianCommunicator::Broadcast(int root,void* data, uint64_t bytes) {}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) { }
 void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
@@ -124,20 +126,33 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
   dest=0;
 }
+int CartesianCommunicator::IsOffNode(int rank) { return false; }
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
						      int xmit_to_rank,int dox,
						      void *recv,
						      int recv_from_rank,int dor,
-						     int bytes, int dir)
+						     uint64_t bytes, int dir)
 {
   return 2.0*bytes;
 }
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
+void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
+double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
							    void *xmit,
							    int xmit_to_rank,int dox,
							    void *recv,
							    int recv_from_rank,int dor,
-							   int xbytes,int rbytes, int dir)
+							   uint64_t xbytes,uint64_t rbytes, int dir)
+{
+  return 0.0;
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit, void *xmit_comp,
+							 int xmit_to_rank,int dox,
+							 void *recv, void *recv_comp,
+							 int recv_from_rank,int dor,
+							 uint64_t xbytes,uint64_t rbytes, int dir)
 {
   return xbytes+rbytes;
 }

View File

@@ -58,8 +58,8 @@ int GlobalSharedMemory::WorldNode;
 void GlobalSharedMemory::SharedMemoryFree(void)
 {
-  assert(_ShmAlloc);
-  assert(_ShmAllocBytes>0);
+  GRID_ASSERT(_ShmAlloc);
+  GRID_ASSERT(_ShmAllocBytes>0);
   for(int r=0;r<WorldShmSize;r++){
     munmap(WorldShmCommBufs[r],_ShmAllocBytes);
   }
@@ -80,7 +80,7 @@ void *SharedMemory::HostBufferMalloc(size_t bytes){
     std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
     std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
     std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
-    assert(host_heap_bytes<host_heap_size);
+    GRID_ASSERT(host_heap_bytes<host_heap_size);
   }
   return ptr;
 }
@@ -100,7 +100,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
     std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
     std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
     std::cout<< " Current heap  is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
-    assert(heap_bytes<heap_size);
+    GRID_ASSERT(heap_bytes<heap_size);
   }
   //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
   return ptr;
@@ -127,13 +127,13 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
   if ( str ) {
     std::vector<int> IntShmDims;
     GridCmdOptionIntVector(std::string(str),IntShmDims);
-    assert(IntShmDims.size() == WorldDims.size());
+    GRID_ASSERT(IntShmDims.size() == WorldDims.size());
     long ShmSize = 1;
     for (int dim=0;dim<WorldDims.size();dim++) {
       ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
-      assert(divides(ShmDims[dim],WorldDims[dim]));
+      GRID_ASSERT(divides(ShmDims[dim],WorldDims[dim]));
     }
-    assert(ShmSize == WorldShmSize);
+    GRID_ASSERT(ShmSize == WorldShmSize);
     return;
   }

View File

@@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
 #if defined (GRID_COMMS_MPI3)
 typedef MPI_Comm    Grid_MPI_Comm;
+typedef MPI_Request MpiCommsRequest_t;
+#ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
 #else
+/*
+ * Enable state transitions as each packet flows.
+ */
+enum PacketType_t {
+  FaceGather,
+  InterNodeXmit,
+  InterNodeRecv,
+  IntraNodeXmit,
+  IntraNodeRecv,
+  InterNodeXmitISend,
+  InterNodeReceiveHtoD
+};
+/*
+ *Package arguments needed for various actions along packet flow
+ */
+typedef struct {
+  PacketType_t PacketType;
+  void *host_buf;
+  void *device_buf;
+  int dest;
+  int tag;
+  int commdir;
+  unsigned long bytes;
+  acceleratorEvent_t ev;
+  MpiCommsRequest_t req;
+} CommsRequest_t;
+#endif
+#else
+typedef int MpiCommsRequest_t;
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
 #endif
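The PacketType field turns each CommsRequest_t into a small state machine: a send is born InterNodeXmit, becomes InterNodeXmitISend once its DtoH copy completes and MPI_Isend is posted, while a receive moves from InterNodeRecv to InterNodeReceiveHtoD after MPI_Test succeeds. A hedged summary of the transitions as they appear in the Poll/Complete functions (the helper below is illustrative, not part of the header):

// Illustrative transition table only; the real transitions happen inline in
// StencilSendToRecvFromPollDtoH / PollIRecv / Complete.
enum PacketType_t { FaceGather, InterNodeXmit, InterNodeRecv, IntraNodeXmit,
                    IntraNodeRecv, InterNodeXmitISend, InterNodeReceiveHtoD };

const char *next_state(PacketType_t s) {
  switch (s) {
  case InterNodeXmit:        return "InterNodeXmitISend (DtoH done, MPI_Isend posted)";
  case InterNodeRecv:        return "InterNodeReceiveHtoD (MPI_Test passed, HtoD queued)";
  case InterNodeXmitISend:   return "retired by MPI_Waitall in Complete";
  case InterNodeReceiveHtoD: return "checksum verified in Complete";
  default:                   return "terminal: drained by acceleratorCopySynchronise";
  }
}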
@@ -105,7 +137,7 @@ public:
   ///////////////////////////////////////////////////
   static void SharedMemoryAllocate(uint64_t bytes, int flags);
   static void SharedMemoryFree(void);
-  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
+  //  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
   static void SharedMemoryZero(void *dest,size_t bytes);
 };

View File

@@ -42,6 +42,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
+#else
 #endif
 #include <syscall.h>
 #endif
@@ -66,7 +67,7 @@ public:
   {
     int errnum;
-    sock = socket(AF_UNIX, SOCK_DGRAM, 0);  assert(sock>0);
+    sock = socket(AF_UNIX, SOCK_DGRAM, 0);  GRID_ASSERT(sock>0);
     struct sockaddr_un sa_un = { 0 };
     sa_un.sun_family = AF_UNIX;
@@ -157,7 +158,7 @@ public:
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
-  assert(_ShmSetup==0);
+  GRID_ASSERT(_ShmSetup==0);
   WorldComm = comm;
   MPI_Comm_rank(WorldComm,&WorldRank);
   MPI_Comm_size(WorldComm,&WorldSize);
@@ -183,7 +184,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   // WorldNodes
   WorldNodes = WorldSize/WorldShmSize;
-  assert( (WorldNodes * WorldShmSize) == WorldSize );
+  GRID_ASSERT( (WorldNodes * WorldShmSize) == WorldSize );
   // FIXME: Check all WorldShmSize are the same ?
@@ -208,7 +209,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   MyGroup.resize(WorldShmSize);
   for(int rank=0;rank<WorldSize;rank++){
     if(WorldShmRanks[rank]!=MPI_UNDEFINED){
-      assert(g<WorldShmSize);
+      GRID_ASSERT(g<WorldShmSize);
       MyGroup[g++] = rank;
     }
   }
@@ -224,7 +225,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   // global sum leaders over comm world
   ///////////////////////////////////////////////////////////////////
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
   ///////////////////////////////////////////////////////////////////
   // find the group leaders world rank
@@ -245,7 +246,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
       WorldNode=g;
     }
   }
-  assert(WorldNode!=-1);
+  GRID_ASSERT(WorldNode!=-1);
   _ShmSetup=1;
 }
 // Gray encode support
@@ -287,7 +288,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   // Assert power of two shm_size.
   ////////////////////////////////////////////////////////////////
   int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
-  assert(log2size != -1);
+  GRID_ASSERT(log2size != -1);
   ////////////////////////////////////////////////////////////////
   // Identify the hypercube coordinate of this node using hostname
@@ -308,7 +309,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
     // Parse ICE-XA hostname to get hypercube location
     gethostname(name,namelen);
     int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-    assert(nscan==3);
+    GRID_ASSERT(nscan==3);
     int nlo = N%9;
     int nhi = N/9;
@@ -332,8 +333,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   //////////////////////////////////////////////////////////////////
   MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
   hypercoor=hypercoor-rootcoor;
-  assert(hypercoor<WorldSize);
-  assert(hypercoor>=0);
+  GRID_ASSERT(hypercoor<WorldSize);
+  GRID_ASSERT(hypercoor>=0);
   //////////////////////////////////////
   // Printing
@@ -381,7 +382,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   for(int i=0;i<ndimension;i++){
     Nprocessors*=processors[i];
   }
-  assert(WorldSize==Nprocessors);
+  GRID_ASSERT(WorldSize==Nprocessors);
   ////////////////////////////////////////////////////////////////
   // Establish mapping between lexico physics coord and WorldRank
@@ -400,7 +401,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   // Build the new communicator
   /////////////////////////////////////////////////////////////////
   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
 {
@@ -430,7 +431,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
   for(int i=0;i<ndimension;i++){
     Nprocessors*=processors[i];
   }
-  assert(WorldSize==Nprocessors);
+  //  std::cerr << " WorldSize "<<WorldSize << " Nprocessors "<<Nprocessors<<" "<<processors<<std::endl;
+  GRID_ASSERT(WorldSize==Nprocessors);
   ////////////////////////////////////////////////////////////////
   // Establish mapping between lexico physics coord and WorldRank
@@ -446,7 +448,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
   // Build the new communicator
   /////////////////////////////////////////////////////////////////
   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
-  assert(ierr==0);
+  GRID_ASSERT(ierr==0);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // SHMGET
@@ -455,8 +457,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
@@ -517,8 +519,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the pointer array for shared windows for our group
@@ -537,7 +539,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  HostCommBuf= malloc(bytes);
+  //  printf("Host buffer allocate for GPU non-aware MPI\n");
+  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
 #endif
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
@@ -545,11 +548,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     exit(EXIT_FAILURE);
   }
   if ( WorldRank == 0 ){
-    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
+    std::cout << Mheader " acceleratorAllocDevice "<< bytes
	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
   }
   SharedMemoryZero(ShmCommBuf,bytes);
-  std::cout<< "Setting up IPC"<<std::endl;
+  if ( WorldRank == 0 ){
+    std::cout<< Mheader "Setting up IPC"<<std::endl;
+  }
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Loop over ranks/gpu's on our node
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -569,8 +574,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
     typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
-    auto zeDevice  = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
-    auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+    auto zeDevice  = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+    auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
     ze_ipc_mem_handle_t ihandle;
     clone_mem_t handle;
@@ -580,8 +585,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
       if ( err != ZE_RESULT_SUCCESS ) {
	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
	exit(EXIT_FAILURE);
-      } else {
-	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
       }
       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
       handle.pid = getpid();
@@ -626,7 +629,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
		    MPI_BYTE,
		    r,
		    WorldShmComm);
-      assert(ierr==0);
+      GRID_ASSERT(ierr==0);
     }
     ///////////////////////////////////////////////////////////////
@@ -640,12 +643,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef SHM_SOCKETS
	myfd=UnixSockets::RecvFileDescriptor();
 #else
-	std::cout<<"mapping seeking remote pid/fd "
-		 <<handle.pid<<"/"
-		 <<handle.fd<<std::endl;
+	//	std::cout<<"mapping seeking remote pid/fd "
+	//		 <<handle.pid<<"/"
+	//		 <<handle.fd<<std::endl;
	int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
-	std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
+	//	std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
	//	int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
	myfd  = syscall(438,pidfd,handle.fd,0);
	int err_t = errno;
@@ -655,7 +658,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
	  assert(0);
	}
 #endif
-	std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
+	//	std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
	memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
	memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -664,11 +667,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
	  std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
	  std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
	  exit(EXIT_FAILURE);
-	} else {
-	  std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
-	  std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
	}
-	assert(thisBuf!=nullptr);
+	GRID_ASSERT(thisBuf!=nullptr);
       }
 #endif
 #ifdef GRID_CUDA
@@ -709,8 +709,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -740,13 +740,14 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
     if ( ptr == (void *)MAP_FAILED ) {
       printf("mmap %s failed\n",shm_name);
-      perror("failed mmap");      assert(0);
+      perror("failed mmap");      GRID_ASSERT(0);
     }
-    assert(((uint64_t)ptr&0x3F)==0);
+    GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
     close(fd);
     WorldShmCommBufs[r] =ptr;
     //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
   }
+  std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
   _ShmAlloc=1;
   _ShmAllocBytes  = bytes;
 };
@@ -756,8 +757,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // allocate the shared windows for our group
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -768,7 +769,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   // Hugetlbf and others map filesystems as mappable huge pages
   ////////////////////////////////////////////////////////////////////////////////////////////
   char shm_name [NAME_MAX];
-  assert(WorldShmSize == 1);
+  GRID_ASSERT(WorldShmSize == 1);
   for(int r=0;r<WorldShmSize;r++){
     int fd=-1;
@@ -782,9 +783,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
     if ( ptr == (void *)MAP_FAILED ) {
       printf("mmap %s failed\n",shm_name);
-      perror("failed mmap");      assert(0);
+      perror("failed mmap");      GRID_ASSERT(0);
     }
-    assert(((uint64_t)ptr&0x3F)==0);
+    GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
     close(fd);
     WorldShmCommBufs[r] =ptr;
     //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
@@ -803,8 +804,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   MPI_Barrier(WorldShmComm);
   WorldShmCommBufs.resize(WorldShmSize);
@@ -835,7 +836,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
	perror("failed mmap");
	assert(0);
       }
-      assert(((uint64_t)ptr&0x3F)==0);
+      GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
       WorldShmCommBufs[r] =ptr;
       close(fd);
@@ -856,8 +857,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      }
       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
-      assert(((uint64_t)ptr&0x3F)==0);
+      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      GRID_ASSERT(0);    }
+      GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
       WorldShmCommBufs[r] =ptr;
       close(fd);
@@ -880,14 +881,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
   bzero(dest,bytes);
 #endif
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-  acceleratorCopyToDevice(src,dest,bytes);
-#else
-  bcopy(src,dest,bytes);
-#endif
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+//  acceleratorCopyToDevice(src,dest,bytes);
+//#else
+//  bcopy(src,dest,bytes);
+//#endif
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@@ -914,7 +915,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   //////////////////////////////////////////////////////////////////////
   // Map ShmRank to WorldShmRank and use the right buffer
   //////////////////////////////////////////////////////////////////////
-  assert (GlobalSharedMemory::ShmAlloc()==1);
+  GRID_ASSERT (GlobalSharedMemory::ShmAlloc()==1);
   heap_size = GlobalSharedMemory::ShmAllocBytes();
   for(int r=0;r<ShmSize;r++){
@@ -923,6 +924,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
     MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
     ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+    //    std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
   }
   ShmBufferFreeAll();
@@ -975,19 +977,18 @@ void SharedMemory::SharedMemoryTest(void)
       check[0]=GlobalSharedMemory::WorldNode;
       check[1]=r;
       check[2]=magic;
-      GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
+      acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
     }
   }
   ShmBarrier();
   for(uint64_t r=0;r<ShmSize;r++){
-    ShmBarrier();
-    GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
-    ShmBarrier();
-    assert(check[0]==GlobalSharedMemory::WorldNode);
-    assert(check[1]==r);
-    assert(check[2]==magic);
-    ShmBarrier();
+    acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
+    GRID_ASSERT(check[0]==GlobalSharedMemory::WorldNode);
+    GRID_ASSERT(check[1]==r);
+    GRID_ASSERT(check[2]==magic);
   }
+  ShmBarrier();
+  std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
 }
 void *SharedMemory::ShmBuffer(int rank)
@@ -1002,12 +1003,14 @@ void *SharedMemory::ShmBuffer(int rank)
 void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
   int gpeer = ShmRanks[rank];
-  assert(gpeer!=ShmRank); // never send to self
+  GRID_ASSERT(gpeer!=ShmRank); // never send to self
+  //  std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
   if (gpeer == MPI_UNDEFINED){
     return NULL;
   } else {
     uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+    //    std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
     return (void *) remote;
   }
 }
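ShmBufferTranslate works because every rank maps the same shared segments at per-rank base addresses, so a buffer's offset inside the local window equals its offset inside the peer's window. A small worked example of the pointer arithmetic (the addresses are invented for illustration):

#include <cstdint>
#include <cassert>

int main() {
  // Invented base addresses of one shared segment as mapped by two ranks.
  uint64_t my_base   = 0x7f0000000000ULL;  // ShmCommBufs[ShmRank]
  uint64_t peer_base = 0x7f4000000000ULL;  // ShmCommBufs[gpeer]
  uint64_t local_p   = my_base + 0x1200;   // a buffer inside my window

  uint64_t offset = local_p - my_base;     // window-relative position
  uint64_t remote = peer_base + offset;    // same buffer as the peer sees it

  assert(remote == 0x7f4000001200ULL);
  return 0;
}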

View File

@@ -34,7 +34,7 @@ NAMESPACE_BEGIN(Grid);
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
 {
-  assert(_ShmSetup==0);
+  GRID_ASSERT(_ShmSetup==0);
   WorldComm = 0;
   WorldRank = 0;
   WorldSize = 1;
@@ -62,8 +62,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
   void * ShmCommBuf ;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
@@ -92,8 +92,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
+  GRID_ASSERT(_ShmSetup==1);
+  GRID_ASSERT(_ShmAlloc==0);
   int mmap_flag =0;
 #ifdef MAP_ANONYMOUS
   mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
@@ -122,17 +122,17 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
   acceleratorMemSet(dest,0,bytes);
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-  acceleratorCopyToDevice(src,dest,bytes);
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+//  acceleratorCopyToDevice(src,dest,bytes);
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
 ////////////////////////////////////////////////////////
 void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 {
-  assert(GlobalSharedMemory::ShmAlloc()==1);
+  GRID_ASSERT(GlobalSharedMemory::ShmAlloc()==1);
   ShmRanks.resize(1);
   ShmCommBufs.resize(1);
   ShmRanks[0] = 0;

View File

@@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #endif
 NAMESPACE_BEGIN(Grid);
-
 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
 auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr))
 {

View File

@@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 extern std::vector<std::pair<int,int> > Cshift_table;
-extern commVector<std::pair<int,int> > Cshift_table_device;
+extern deviceVector<std::pair<int,int> > Cshift_table_device;
 inline std::pair<int,int> *MapCshiftTable(void)
 {
   // GPU version
-#ifdef ACCELERATOR_CSHIFT
   uint64_t sz=Cshift_table.size();
   if (Cshift_table_device.size()!=sz ) {
     Cshift_table_device.resize(sz);
@@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
		       sizeof(Cshift_table[0])*sz);
   return &Cshift_table_device[0];
-#else
-  return &Cshift_table[0];
-#endif
   // CPU version use identify map
 }
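MapCshiftTable keeps a device-side mirror of a host table and refreshes it only when the host copy has changed size, so repeated shifts of the same geometry pay for a single upload. A hedged sketch of the same lazy-mirror pattern, self-contained with a memcpy standing in for the accelerator upload (the helper names are illustrative, not Grid's API):

#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in for Grid's acceleratorCopyToDevice(from,to,bytes); memcpy models the upload.
static void copy_to_device(const void *from, void *to, size_t bytes) {
  std::memcpy(to, from, bytes);
}

template<class T>
T *map_table(std::vector<T> &host, std::vector<T> &dev_mirror) {
  uint64_t sz = host.size();
  if (dev_mirror.size() != sz) {   // geometry changed: re-upload once
    dev_mirror.resize(sz);
    copy_to_device(&host[0], &dev_mirror[0], sizeof(T) * sz);
  }
  return &dev_mirror[0];           // stable "device" pointer for kernels
}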
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void
-Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
@@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
   {
     auto buffer_p = & buffer[0];
     auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT
     autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for(i,ent,vobj::Nsimd(),{
       coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
     });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for(i,ent,{
-      buffer_p[table[i].first]=rhs_v[table[i].second];
-    });
-#endif
   }
 }
@@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
   int n1=rhs.Grid()->_slice_stride[dimension];
   if ( cbmask ==0x3){
-#ifdef ACCELERATOR_CSHIFT
     autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for(nn,e1*e2,1,{
       int n = nn%e1;
@@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
       vobj temp =rhs_v[so+o+b];
       extract<vobj>(temp,pointers,offset);
     });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for2d(n,e1,b,e2,{
-      int o = n*n1;
-      int offset = b+n*e2;
-      vobj temp =rhs_v[so+o+b];
-      extract<vobj>(temp,pointers,offset);
-    });
-#endif
   } else {
     Coordinate rdim=rhs.Grid()->_rdimensions;
     Coordinate cdm =rhs.Grid()->_checker_dim_mask;
     std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
-#ifdef ACCELERATOR_CSHIFT
     autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for(nn,e1*e2,1,{
       int n = nn%e1;
@@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
         extract<vobj>(temp,pointers,offset);
       }
     });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for2d(n,e1,b,e2,{
-      Coordinate coor;
-      int o=n*n1;
-      int oindex = o+b;
-      int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
-      int ocb=1<<cb;
-      int offset = b+n*e2;
-      if ( ocb & cbmask ) {
-        vobj temp =rhs_v[so+o+b];
-        extract<vobj>(temp,pointers,offset);
-      }
-    });
-#endif
   }
 }
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
@@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
   {
     auto buffer_p = & buffer[0];
     auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT
-    autoView( rhs_v, rhs, AcceleratorWrite);
+    autoView( rhs_v, rhs, AcceleratorWriteDiscard);
     accelerator_for(i,ent,vobj::Nsimd(),{
       coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
     });
-#else
-    autoView( rhs_v, rhs, CpuWrite);
-    thread_for(i,ent,{
-      rhs_v[table[i].first]=buffer_p[table[i].second];
-    });
-#endif
   }
 }
@@ -278,8 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   if(cbmask ==0x3 ) {
     int _slice_stride = rhs.Grid()->_slice_stride[dimension];
     int _slice_block = rhs.Grid()->_slice_block[dimension];
-#ifdef ACCELERATOR_CSHIFT
-    autoView( rhs_v , rhs, AcceleratorWrite);
+    autoView( rhs_v , rhs, AcceleratorWriteDiscard);
     accelerator_for(nn,e1*e2,1,{
       int n = nn%e1;
       int b = nn/e1;
@@ -287,21 +236,13 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
       int offset = b+n*_slice_block;
       merge(rhs_v[so+o+b],pointers,offset);
     });
-#else
-    autoView( rhs_v , rhs, CpuWrite);
-    thread_for2d(n,e1,b,e2,{
-      int o = n*_slice_stride;
-      int offset = b+n*_slice_block;
-      merge(rhs_v[so+o+b],pointers,offset);
-    });
-#endif
   } else {
     // Case of SIMD split AND checker dim cannot currently be hit, except in
     // Test_cshift_red_black code.
-    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+    std::cout << "Scatter_plane merge GRID_ASSERT(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
-    assert(0); // This will fail if hit on GPU
+    GRID_ASSERT(0); // This will fail if hit on GPU
     autoView( rhs_v, rhs, CpuWrite);
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
@@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   {
     auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT
     autoView(rhs_v , rhs, AcceleratorRead);
-    autoView(lhs_v , lhs, AcceleratorWrite);
+    autoView(lhs_v , lhs, AcceleratorWriteDiscard);
     accelerator_for(i,ent,vobj::Nsimd(),{
       coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
     });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    autoView(lhs_v , lhs, CpuWrite);
-    thread_for(i,ent,{
-      lhs_v[table[i].first]=rhs_v[table[i].second];
-    });
-#endif
   }
 }
@@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   {
     auto table = MapCshiftTable();
-#ifdef ACCELERATOR_CSHIFT
     autoView( rhs_v, rhs, AcceleratorRead);
     autoView( lhs_v, lhs, AcceleratorWrite);
     accelerator_for(i,ent,1,{
       permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
     });
-#else
-    autoView( rhs_v, rhs, CpuRead);
-    autoView( lhs_v, lhs, CpuWrite);
-    thread_for(i,ent,{
-      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
-    });
-#endif
   }
 }
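These hunks delete the #ifdef ACCELERATOR_CSHIFT CPU fallbacks, leaving a single accelerator_for path driven by the precomputed (destination,source) index table. A stand-alone sketch of the table-driven gather that path performs, with a plain C++ loop standing in for accelerator_for and the coalescedRead/coalescedWrite SIMT plumbing elided (both substitutions are assumptions for illustration):

  #include <cstddef>
  #include <utility>
  #include <vector>

  // Gather a plane: buffer[first] = field[second] for each table entry.
  // In Grid the loop body runs on the accelerator via accelerator_for.
  template<class T>
  void gather_plane(std::vector<T>& buffer, const std::vector<T>& field,
                    const std::vector<std::pair<int,int>>& table) {
    for (std::size_t i = 0; i < table.size(); ++i) {
      buffer[table[i].first] = field[table[i].second];
    }
  }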

View File

@@ -29,9 +29,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_
 NAMESPACE_BEGIN(Grid);
+#ifdef GRID_CHECKSUM_COMMS
+extern uint64_t checksum_index;
+#endif
+const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
   typedef typename vobj::vector_type vector_type;
@@ -45,6 +49,20 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
   // Map to always positive shift modulo global full dimension.
   shift = (shift+fd)%fd;
+  if( shift ==0 ) {
+    ret = rhs;
+    return ret;
+  }
+  //
+  // Potential easy fast cases:
+  // Shift is a multiple of the local lattice extent.
+  // Then need only to shift whole subvolumes
+  int L = rhs.Grid()->_ldimensions[dimension];
+  if ( (shift%L )==0 && !rhs.Grid()->CheckerBoarded(dimension) ) {
+    Cshift_simple(ret,rhs,dimension,shift);
+    return ret;
+  }
   ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
   // the permute type
@@ -65,10 +83,59 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
     Cshift_comms(ret,rhs,dimension,shift);
   }
   t1=usecond();
-  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
+  if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
   return ret;
 }
+template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  GridBase *grid=rhs.Grid();
+  int comm_proc, xmit_to_rank, recv_from_rank;
+  int fd = rhs.Grid()->_fdimensions[dimension];
+  int rd = rhs.Grid()->_rdimensions[dimension];
+  int ld = rhs.Grid()->_ldimensions[dimension];
+  int pd = rhs.Grid()->_processors[dimension];
+  int simd_layout = rhs.Grid()->_simd_layout[dimension];
+  int comm_dim    = rhs.Grid()->_processors[dimension] >1 ;
+  comm_proc = ((shift)/ld)%pd;
+  grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+  if(comm_dim) {
+    int64_t bytes = sizeof(vobj) * grid->oSites();
+    autoView(rhs_v , rhs, AcceleratorRead);
+    autoView(ret_v , ret, AcceleratorWrite);
+    void *send_buf = (void *)&rhs_v[0];
+    void *recv_buf = (void *)&ret_v[0];
+#ifdef ACCELERATOR_AWARE_MPI
+    grid->SendToRecvFrom(send_buf,
+                         xmit_to_rank,
+                         recv_buf,
+                         recv_from_rank,
+                         bytes);
+#else
+    static hostVector<vobj> hrhs; hrhs.resize(grid->oSites());
+    static hostVector<vobj> hret; hret.resize(grid->oSites());
+    void *hsend_buf = (void *)&hrhs[0];
+    void *hrecv_buf = (void *)&hret[0];
+    acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
+    grid->SendToRecvFrom(hsend_buf,
+                         xmit_to_rank,
+                         hrecv_buf,
+                         recv_from_rank,
+                         bytes);
+    acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+  }
+}
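A quick worked check of the fast path Cshift_simple serves: when the shift is a whole multiple of the local extent, every site moves to the same neighbouring rank, so one whole-subvolume exchange replaces the plane-by-plane gather/scatter. Assuming a hypothetical global extent fd=32 over pd=4 ranks (local extent ld=8):

  #include <cstdio>

  int main() {
    int fd = 32, pd = 4;   // assumed global extent and processor grid
    int ld = fd / pd;      // local extent per rank = 8
    int shift = 16;        // multiple of ld -> whole-subvolume path
    if (shift % ld == 0) {
      int comm_proc = (shift / ld) % pd;   // = 2 ranks along the dimension
      std::printf("send whole subvolume %d ranks along the dimension\n",
                  comm_proc);
    }
    return 0;
  }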
 template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
 {
   int sshift[2];
@@ -94,7 +161,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
-  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+  //  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
   if ( sshift[0] == sshift[1] ) {
     //    std::cout << "Single pass Cshift_comms" <<std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
@@ -104,8 +171,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
     Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
   }
 }
-#define ACCELERATOR_CSHIFT_NO_COPY
-#ifdef ACCELERATOR_CSHIFT_NO_COPY
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
   typedef typename vobj::vector_type vector_type;
@@ -119,14 +184,19 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   int pd = rhs.Grid()->_processors[dimension];
   int simd_layout = rhs.Grid()->_simd_layout[dimension];
   int comm_dim    = rhs.Grid()->_processors[dimension] >1 ;
-  assert(simd_layout==1);
-  assert(comm_dim==1);
-  assert(shift>=0);
-  assert(shift<fd);
+  GRID_ASSERT(simd_layout==1);
+  GRID_ASSERT(comm_dim==1);
+  GRID_ASSERT(shift>=0);
+  GRID_ASSERT(shift<fd);
   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
-  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
+  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
+#ifndef ACCELERATOR_AWARE_MPI
+  int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
+#endif
   int cb= (cbmask==0x2)? Odd : Even;
   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -141,9 +211,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
     int comm_proc = ((x+sshift)/rd)%pd;
     if (comm_proc==0) {
+      FlightRecorder::StepLog("Cshift_Copy_plane");
       tcopy-=usecond();
       Copy_plane(ret,rhs,dimension,x,sx,cbmask);
      tcopy+=usecond();
+      FlightRecorder::StepLog("Cshift_Copy_plane_complete");
     } else {
       int words = buffer_size;
@@ -151,39 +223,84 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
       int bytes = words * sizeof(vobj);
+      FlightRecorder::StepLog("Cshift_Gather_plane");
       tgather-=usecond();
       Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
       tgather+=usecond();
+      FlightRecorder::StepLog("Cshift_Gather_plane_complete");
       //      int rank = grid->_processor;
       int recv_from_rank;
       int xmit_to_rank;
       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
       tcomms-=usecond();
-      //      grid->Barrier();
+      grid->Barrier();
+      FlightRecorder::StepLog("Cshift_SendRecv");
+#ifdef ACCELERATOR_AWARE_MPI
       grid->SendToRecvFrom((void *)&send_buf[0],
                            xmit_to_rank,
                            (void *)&recv_buf[0],
                            recv_from_rank,
                            bytes);
+#else
+      // bouncy bouncy
+      acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
+#ifdef GRID_CHECKSUM_COMMS
+      GRID_ASSERT(bytes % 8 == 0);
+      checksum_index++;
+      uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
+      *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
+      bytes += 8;
+#endif
+      grid->SendToRecvFrom((void *)&hsend_buf[0],
+                           xmit_to_rank,
+                           (void *)&hrecv_buf[0],
+                           recv_from_rank,
+                           bytes);
+#ifdef GRID_CHECKSUM_COMMS
+      bytes -= 8;
+      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+      uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
+      uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
+      std::cout << GridLogComms<< " Cshift: "
+                <<" dim"<<dimension
+                <<" shift "<<shift
+                << " rank "<< grid->ThisRank()
+                <<" Coor "<<grid->ThisProcessorCoor()
+                <<" send "<<xsum<<" to "<<xmit_to_rank
+                <<" recv "<<computed_cs<<" from "<<recv_from_rank
+                <<std::endl;
+      GRID_ASSERT(expected_cs == computed_cs);
+#else
+      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+#endif
+      FlightRecorder::StepLog("Cshift_SendRecv_complete");
       xbytes+=bytes;
-      //      grid->Barrier();
+      grid->Barrier();
       tcomms+=usecond();
+      FlightRecorder::StepLog("Cshift_barrier_complete");
       tscatter-=usecond();
       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
       tscatter+=usecond();
     }
   }
-  /*
+  if (Cshift_verbose){
     std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
+  }
 }
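A self-contained sketch of the handshake the GRID_CHECKSUM_COMMS path performs: XOR-fold the payload into one 64-bit word, perturb it with a monotonically increasing message index (so a stale or replayed buffer cannot alias a fresh one), append it after the payload, and verify on receipt. checksum_gpu is Grid's device-side reduction; the plain host loop below is a stand-in, which is an assumption for illustration:

  #include <cassert>
  #include <cstdint>
  #include <cstring>
  #include <vector>

  static uint64_t g_index = 0; // per-message sequence number

  // Host stand-in for checksum_gpu(): XOR-fold 64-bit words.
  uint64_t xor_checksum(const uint64_t* w, size_t nwords) {
    uint64_t x = 0;
    for (size_t i = 0; i < nwords; ++i) x ^= w[i];
    return x;
  }

  // Append an index-salted checksum after `bytes` of payload.
  // buf must have at least bytes+8 of capacity (the "pad" above).
  size_t seal(std::vector<char>& buf, size_t bytes) {
    assert(bytes % 8 == 0);
    ++g_index;
    uint64_t cs = xor_checksum((const uint64_t*)buf.data(), bytes / 8)
                ^ (1 + g_index);
    std::memcpy(buf.data() + bytes, &cs, 8);
    return bytes + 8;   // wire size includes the trailing seal
  }

  // Verify the seal on the receive side; a mismatch flags corruption in flight.
  bool verify(const std::vector<char>& buf, size_t wire_bytes) {
    size_t bytes = wire_bytes - 8;
    uint64_t expected;
    std::memcpy(&expected, buf.data() + bytes, 8);
    uint64_t computed = xor_checksum((const uint64_t*)buf.data(), bytes / 8)
                      ^ (1 + g_index);
    return expected == computed;
  }

The scheme relies on sender and receiver incrementing the sequence number in lockstep, exactly as both ranks bump checksum_index before each exchange in the hunk above.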
 template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -205,10 +322,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
   //        << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
   //        << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
-  assert(comm_dim==1);
-  assert(simd_layout==2);
-  assert(shift>=0);
-  assert(shift<fd);
+  GRID_ASSERT(comm_dim==1);
+  GRID_ASSERT(simd_layout==2);
+  GRID_ASSERT(shift>=0);
+  GRID_ASSERT(shift<fd);
   RealD tcopy=0.0;
   RealD tgather=0.0;
@@ -224,8 +341,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
   //  int words = sizeof(vobj)/sizeof(vector_type);
-  static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
-  static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
+  static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
   scalar_object * recv_buf_extract_mpi;
   scalar_object * send_buf_extract_mpi;
@@ -233,6 +350,18 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
     send_buf_extract[s].resize(buffer_size);
     recv_buf_extract[s].resize(buffer_size);
   }
+#ifndef ACCELERATOR_AWARE_MPI
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
+#endif
   int bytes = buffer_size*sizeof(scalar_object);
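The padding arithmetic above reserves room for the trailing 8-byte checksum in units of whole elements: ceil(8/sizeof(vobj)) extra slots, temporarily added while sizing the host buffers and removed again before computing the payload size. A worked check, assuming a hypothetical element size of 48 bytes:

  #include <cstdio>

  int main() {
    const size_t sizeof_vobj = 48;                    // assumed element size
    size_t pad = (8 + sizeof_vobj - 1) / sizeof_vobj; // ceil(8/48) = 1 element
    std::printf("pad = %zu element(s) = %zu bytes >= 8\n",
                pad, pad * sizeof_vobj);
    return 0;
  }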
@@ -275,252 +404,62 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
       if (nbr_ic) nbr_lane|=inner_bit;
-      assert (sx == nbr_ox);
+      GRID_ASSERT (sx == nbr_ox);
       if(nbr_proc){
        grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
        tcomms-=usecond();
-       //      grid->Barrier();
+       grid->Barrier();
        send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
        recv_buf_extract_mpi = &recv_buf_extract[i][0];
+#ifdef ACCELERATOR_AWARE_MPI
        grid->SendToRecvFrom((void *)send_buf_extract_mpi,
                             xmit_to_rank,
                             (void *)recv_buf_extract_mpi,
                             recv_from_rank,
                             bytes);
-       xbytes+=bytes;
-       //      grid->Barrier();
-       tcomms+=usecond();
-       rpointers[i] = &recv_buf_extract[i][0];
-      } else {
-       rpointers[i] = &send_buf_extract[nbr_lane][0];
-      }
-    }
-    tscatter-=usecond();
-    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
-    tscatter+=usecond();
-  }
-  /*
-    std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
-}
 #else
-template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
-{
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_type scalar_type;
-  GridBase *grid=rhs.Grid();
-  Lattice<vobj> temp(rhs.Grid());
-  int fd = rhs.Grid()->_fdimensions[dimension];
-  int rd = rhs.Grid()->_rdimensions[dimension];
-  int pd = rhs.Grid()->_processors[dimension];
-  int simd_layout = rhs.Grid()->_simd_layout[dimension];
-  int comm_dim    = rhs.Grid()->_processors[dimension] >1 ;
-  assert(simd_layout==1);
-  assert(comm_dim==1);
-  assert(shift>=0);
-  assert(shift<fd);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
-  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
-  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
-  vobj *send_buf;
-  vobj *recv_buf;
-  {
-    grid->ShmBufferFreeAll();
-    size_t bytes = buffer_size*sizeof(vobj);
-    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
-    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
-  }
-  int cb= (cbmask==0x2)? Odd : Even;
-  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-  for(int x=0;x<rd;x++){
-    int sx = (x+sshift)%rd;
-    int comm_proc = ((x+sshift)/rd)%pd;
-    if (comm_proc==0) {
-      tcopy-=usecond();
-      Copy_plane(ret,rhs,dimension,x,sx,cbmask);
-      tcopy+=usecond();
-    } else {
-      int words = buffer_size;
-      if (cbmask != 0x3) words=words>>1;
-      int bytes = words * sizeof(vobj);
-      tgather-=usecond();
-      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
-      tgather+=usecond();
-      //      int rank = grid->_processor;
-      int recv_from_rank;
-      int xmit_to_rank;
-      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-      tcomms-=usecond();
-      //      grid->Barrier();
-      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
-      grid->SendToRecvFrom((void *)&send_buf[0],
+       // bouncy bouncy
+       acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
+#ifdef GRID_CHECKSUM_COMMS
+       assert(bytes % 8 == 0);
+       checksum_index++;
+       uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
+       *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
+       bytes += 8;
+#endif
+       grid->SendToRecvFrom((void *)&hsend_buf[0],
                             xmit_to_rank,
-                           (void *)&recv_buf[0],
+                           (void *)&hrecv_buf[0],
                             recv_from_rank,
                             bytes);
-      xbytes+=bytes;
-      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
-      //      grid->Barrier();
+#ifdef GRID_CHECKSUM_COMMS
+       bytes -= 8;
+       acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+       uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
+       uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
+       std::cout << GridLogComms<< " Cshift_comms_simd: "
+                 <<" dim"<<dimension
+                 <<" shift "<<shift
+                 << " rank "<< grid->ThisRank()
+                 <<" Coor "<<grid->ThisProcessorCoor()
+                 <<" send "<<xsum<<" to "<<xmit_to_rank
+                 <<" recv "<<computed_cs<<" from "<<recv_from_rank
+                 <<std::endl;
+       assert(expected_cs == computed_cs);
+#else
+       acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+#endif
+#endif
+       xbytes+=bytes;
+       grid->Barrier();
        tcomms+=usecond();
-      tscatter-=usecond();
-      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
-      tscatter+=usecond();
-    }
-  }
-  /*
-    std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  */
-}
-template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
-{
-  GridBase *grid=rhs.Grid();
-  const int Nsimd = grid->Nsimd();
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::scalar_object scalar_object;
-  typedef typename vobj::scalar_type scalar_type;
-  int fd = grid->_fdimensions[dimension];
-  int rd = grid->_rdimensions[dimension];
-  int ld = grid->_ldimensions[dimension];
-  int pd = grid->_processors[dimension];
-  int simd_layout = grid->_simd_layout[dimension];
-  int comm_dim    = grid->_processors[dimension] >1 ;
-  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
-  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
-  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
-  assert(comm_dim==1);
-  assert(simd_layout==2);
-  assert(shift>=0);
-  assert(shift<fd);
-  RealD tcopy=0.0;
-  RealD tgather=0.0;
-  RealD tscatter=0.0;
-  RealD tcomms=0.0;
-  uint64_t xbytes=0;
-  int permute_type=grid->PermuteType(dimension);
-  ///////////////////////////////////////////////
-  // Simd direction uses an extract/merge pair
-  ///////////////////////////////////////////////
-  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
-  //  int words = sizeof(vobj)/sizeof(vector_type);
-  static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
-  static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
-  scalar_object * recv_buf_extract_mpi;
-  scalar_object * send_buf_extract_mpi;
-  {
-    size_t bytes = sizeof(scalar_object)*buffer_size;
-    grid->ShmBufferFreeAll();
-    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
-    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
-  }
-  for(int s=0;s<Nsimd;s++){
-    send_buf_extract[s].resize(buffer_size);
-    recv_buf_extract[s].resize(buffer_size);
-  }
-  int bytes = buffer_size*sizeof(scalar_object);
-  ExtractPointerArray<scalar_object>  pointers(Nsimd); //
-  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
-  ///////////////////////////////////////////
-  // Work out what to send where
-  ///////////////////////////////////////////
-  int cb    = (cbmask==0x2)? Odd : Even;
-  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
-  // loop over outer coord planes orthog to dim
-  for(int x=0;x<rd;x++){
-    // FIXME call local permute copy if none are offnode.
-    for(int i=0;i<Nsimd;i++){
-      pointers[i] = &send_buf_extract[i][0];
-    }
-    tgather-=usecond();
-    int sx = (x+sshift)%rd;
-    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
-    tgather+=usecond();
-    for(int i=0;i<Nsimd;i++){
-      int inner_bit = (Nsimd>>(permute_type+1));
-      int ic= (i&inner_bit)? 1:0;
-      int my_coor  = rd*ic + x;
-      int nbr_coor = my_coor+sshift;
-      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
-      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
-      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
-      int nbr_lane = (i&(~inner_bit));
-      int recv_from_rank;
-      int xmit_to_rank;
-      if (nbr_ic) nbr_lane|=inner_bit;
-      assert (sx == nbr_ox);
-      if(nbr_proc){
-       grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
-       tcomms-=usecond();
-       //      grid->Barrier();
-       acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
-       grid->SendToRecvFrom((void *)send_buf_extract_mpi,
-                            xmit_to_rank,
-                            (void *)recv_buf_extract_mpi,
-                            recv_from_rank,
-                            bytes);
-       acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
-       xbytes+=bytes;
-       //      grid->Barrier();
-       tcomms+=usecond();
        rpointers[i] = &recv_buf_extract[i][0];
       } else {
        rpointers[i] = &send_buf_extract[nbr_lane][0];
@@ -530,17 +469,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
     tscatter-=usecond();
     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
     tscatter+=usecond();
   }
-  /*
+  if(Cshift_verbose){
     std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
-    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
-  */
+    std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  }
 }
-#endif
+}
 NAMESPACE_END(Grid);
 #endif
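Where MPI is not accelerator-aware, both Cshift paths stage through host buffers: device-to-host copy, MPI exchange on the host pointers, host-to-device copy (the "bouncy bouncy" comment in the hunks). A compact sketch of that staging with real MPI calls; the device-copy helpers are modelled here with plain memcpy, which is an assumption standing in for acceleratorCopyFromDevice/acceleratorCopyToDevice:

  #include <mpi.h>
  #include <cstring>
  #include <vector>

  // Exchange `bytes` of device-resident data when MPI cannot read GPU memory.
  void bounce_sendrecv(const char* d_send, char* d_recv, int bytes,
                       int to, int from, MPI_Comm comm) {
    static std::vector<char> h_send, h_recv;   // reused host staging buffers
    h_send.resize(bytes); h_recv.resize(bytes);
    // Real code: acceleratorCopyFromDevice(d_send, h_send.data(), bytes);
    std::memcpy(h_send.data(), d_send, bytes);
    MPI_Sendrecv(h_send.data(), bytes, MPI_BYTE, to,   0,
                 h_recv.data(), bytes, MPI_BYTE, from, 0,
                 comm, MPI_STATUS_IGNORE);
    // Real code: acceleratorCopyToDevice(h_recv.data(), d_recv, bytes);
    std::memcpy(d_recv, h_recv.data(), bytes);
  }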

View File

@@ -1,5 +1,5 @@
 #include <Grid/GridCore.h>
 NAMESPACE_BEGIN(Grid);
 std::vector<std::pair<int,int> > Cshift_table;
-commVector<std::pair<int,int> > Cshift_table_device;
+deviceVector<std::pair<int,int> > Cshift_table_device;
 NAMESPACE_END(Grid);

View File

@@ -245,7 +245,7 @@ template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * =
 inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
 {
   if ((cb == Odd) || (cb == Even)) {
-    assert(cb == lat.Checkerboard());
+    GRID_ASSERT(cb == lat.Checkerboard());
   }
   cb = lat.Checkerboard();
 }

View File

@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
   });
 }
+#define FAST_AXPY_NORM
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
   GRID_TRACE("axpy_norm");
+#ifdef FAST_AXPY_NORM
   return axpy_norm_fast(ret,a,x,y);
+#else
+  ret = a*x+y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
   GRID_TRACE("axpby_norm");
+#ifdef FAST_AXPY_NORM
   return axpby_norm_fast(ret,a,b,x,y);
+#else
+  ret = a*x+b*y;
+  RealD nn=norm2(ret);
+  return nn;
+#endif
 }
 /// Trace product
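The FAST_AXPY_NORM toggle chooses between a fused kernel (build the result and accumulate its squared norm in one sweep) and the two-pass fallback visible in the #else branches. A scalar sketch of the equivalence, with plain doubles standing in for lattice fields (an illustrative assumption):

  #include <cassert>
  #include <cmath>
  #include <vector>

  // Fused: one sweep computes z = a*x + y and the squared norm together.
  double axpy_norm_fused(std::vector<double>& z, double a,
                         const std::vector<double>& x,
                         const std::vector<double>& y) {
    double nrm = 0.0;
    for (size_t i = 0; i < x.size(); ++i) {
      z[i] = a * x[i] + y[i];
      nrm += z[i] * z[i];
    }
    return nrm;
  }

  int main() {
    std::vector<double> x{1, 2, 3}, y{4, 5, 6}, z(3);
    double fused = axpy_norm_fused(z, 2.0, x, y);
    double twopass = 0.0;           // fallback: build z, then norm2(z)
    for (double v : z) twopass += v * v;
    assert(std::fabs(fused - twopass) < 1e-12);
    return 0;
  }

The fused form halves memory traffic, which is why it is the default; the fallback exists as a cross-check when reductions are under suspicion.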
/// Trace product /// Trace product

View File

@@ -120,12 +120,12 @@ public:
     GRID_TRACE("ExpressionTemplateEval");
     GridBase *egrid(nullptr);
     GridFromExpression(egrid,expr);
-    assert(egrid!=nullptr);
+    GRID_ASSERT(egrid!=nullptr);
     conformable(this->_grid,egrid);
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
     auto exprCopy = expr;
@@ -144,12 +144,12 @@ public:
     GRID_TRACE("ExpressionTemplateEval");
     GridBase *egrid(nullptr);
     GridFromExpression(egrid,expr);
-    assert(egrid!=nullptr);
+    GRID_ASSERT(egrid!=nullptr);
     conformable(this->_grid,egrid);
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
     auto exprCopy = expr;
@@ -168,12 +168,12 @@ public:
     GRID_TRACE("ExpressionTemplateEval");
     GridBase *egrid(nullptr);
     GridFromExpression(egrid,expr);
-    assert(egrid!=nullptr);
+    GRID_ASSERT(egrid!=nullptr);
     conformable(this->_grid,egrid);
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
     auto exprCopy = expr;
     ExpressionViewOpen(exprCopy);
@@ -191,11 +191,11 @@ public:
   Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
     this->_grid = nullptr;
     GridFromExpression(this->_grid,expr);
-    assert(this->_grid!=nullptr);
+    GRID_ASSERT(this->_grid!=nullptr);
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
     resize(this->_grid->oSites());
@@ -206,11 +206,11 @@ public:
   Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
     this->_grid = nullptr;
     GridFromExpression(this->_grid,expr);
-    assert(this->_grid!=nullptr);
+    GRID_ASSERT(this->_grid!=nullptr);
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
     resize(this->_grid->oSites());
@@ -221,11 +221,11 @@ public:
   Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
     this->_grid = nullptr;
     GridFromExpression(this->_grid,expr);
-    assert(this->_grid!=nullptr);
+    GRID_ASSERT(this->_grid!=nullptr);
     int cb=-1;
     CBFromExpression(cb,expr);
-    assert( (cb==Odd) || (cb==Even));
+    GRID_ASSERT( (cb==Odd) || (cb==Even));
     this->checkerboard=cb;
     resize(this->_grid->oSites());
@@ -236,11 +236,21 @@ public:
   template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
     vobj vtmp;
     vtmp = r;
+#if 1
+    deviceVector<vobj> vvtmp(1);
+    acceleratorPut(vvtmp[0],vtmp);
+    vobj *vvtmp_p = & vvtmp[0];
     auto me  = View(AcceleratorWrite);
     accelerator_for(ss,me.size(),vobj::Nsimd(),{
-      auto stmp=coalescedRead(vtmp);
+      auto stmp=coalescedRead(*vvtmp_p);
       coalescedWrite(me[ss],stmp);
     });
+#else
+    auto me  = View(CpuWrite);
+    thread_for(ss,me.size(),{
+      me[ss]= r;
+    });
+#endif
     me.ViewClose();
     return *this;
   }
@@ -254,7 +264,7 @@ public:
   Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
     this->_grid = grid;
     resize(this->_grid->oSites());
-    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
+    GRID_ASSERT((((uint64_t)&this->_odata[0])&0xF) ==0);
     this->checkerboard=0;
     SetViewMode(mode);
   }
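The operator=(sobj) change stages the scalar through a one-element device vector before the kernel reads it, instead of letting the device lambda capture a host-stack temporary. A minimal sketch of the idiom, under the assumption that "device" storage is modelled by a separate buffer that kernels may dereference (memcpy stands in for acceleratorPut):

  #include <cstddef>
  #include <cstring>
  #include <vector>

  // Model: kernels may only dereference pointers into "device" storage.
  template<class Kernel>
  void launch(Kernel k, std::size_t n) { for (std::size_t s = 0; s < n; ++s) k(s); }

  int main() {
    double host_scalar = 3.14;
    std::vector<double> device_tmp(1);            // deviceVector<vobj> vvtmp(1)
    std::memcpy(device_tmp.data(), &host_scalar,
                sizeof(double));                  // acceleratorPut(vvtmp[0],vtmp)
    const double* vp = device_tmp.data();         // device-legal pointer
    std::vector<double> field(256);
    launch([&](std::size_t s) { field[s] = *vp; }, field.size());
    return 0;
  }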

View File

@@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
   typedef decltype(basis[0]) Field;
   typedef decltype(basis[0].View(AcceleratorRead)) View;
-  Vector<View> basis_v; basis_v.reserve(basis.size());
-  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
+  hostVector<View> h_basis_v(basis.size());
+  deviceVector<View> d_basis_v(basis.size());
+  typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
   typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
   GridBase* grid = basis[0].Grid();
   for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorWrite));
+    h_basis_v[k] = basis[k].View(AcceleratorWrite);
+    acceleratorPut(d_basis_v[k],h_basis_v[k]);
   }
-#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
-  int max_threads = thread_max();
-  Vector < vobj > Bt(Nm * max_threads);
-  thread_region
-  {
-    vobj* B = &Bt[Nm * thread_num()];
-    thread_for_in_region(ss, grid->oSites(),{
-      for(int j=j0; j<j1; ++j) B[j]=0.;
-      for(int j=j0; j<j1; ++j){
-        for(int k=k0; k<k1; ++k){
-          B[j] +=Qt(j,k) * basis_v[k][ss];
-        }
-      }
-      for(int j=j0; j<j1; ++j){
-        basis_v[j][ss] = B[j];
-      }
-    });
-  }
-#else
-  View *basis_vp = &basis_v[0];
+  View *basis_vp = &d_basis_v[0];
   int nrot = j1-j0;
   if (!nrot) // edge case not handled gracefully by Cuda
@@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
   uint64_t oSites   =grid->oSites();
   uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
-  Vector <vobj> Bt(siteBlock * nrot);
+  deviceVector <vobj> Bt(siteBlock * nrot);
   auto Bp=&Bt[0];
   // GPU readable copy of matrix
-  Vector<Coeff_t> Qt_jv(Nm*Nm);
+  hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
+  deviceVector<Coeff_t> Qt_jv(Nm*Nm);
   Coeff_t *Qt_p = & Qt_jv[0];
   thread_for(i,Nm*Nm,{
     int j = i/Nm;
     int k = i%Nm;
-    Qt_p[i]=Qt(j,k);
+    h_Qt_jv[i]=Qt(j,k);
   });
+  acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
   // Block the loop to keep storage footprint down
   for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
       coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
     });
   }
-#endif
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
+  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
 }
 // Extract a single rotated vector
@@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
   result.Checkerboard() = basis[0].Checkerboard();
-  Vector<View> basis_v; basis_v.reserve(basis.size());
+  hostVector<View> h_basis_v(basis.size());
+  deviceVector<View> d_basis_v(basis.size());
   for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorRead));
+    h_basis_v[k]=basis[k].View(AcceleratorRead);
+    acceleratorPut(d_basis_v[k],h_basis_v[k]);
   }
-  vobj zz=Zero();
-  Vector<double> Qt_jv(Nm);
-  double * Qt_j = & Qt_jv[0];
-  for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
-  auto basis_vp=& basis_v[0];
+  vobj zz=Zero();
+  deviceVector<double> Qt_jv(Nm);
+  double * Qt_j = & Qt_jv[0];
+  for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
+  auto basis_vp=& d_basis_v[0];
   autoView(result_v,result,AcceleratorWrite);
   accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
     vobj zzz=Zero();
@@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
     }
     coalescedWrite(result_v[ss], B);
   });
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
+  for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
 }
 template<class Field>
@@ -179,9 +166,9 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
 {
   int vlen = idx.size();
-  assert(vlen>=1);
-  assert(vlen<=sort_vals.size());
-  assert(vlen<=_v.size());
+  GRID_ASSERT(vlen>=1);
+  GRID_ASSERT(vlen<=sort_vals.size());
+  GRID_ASSERT(vlen<=_v.size());
   for (size_t i=0;i<vlen;i++) {
@@ -199,7 +186,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
         if (idx[j]==i)
           break;
-      assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
+      GRID_ASSERT(idx[i] > i); GRID_ASSERT(j!=idx.size()); GRID_ASSERT(idx[j]==i);
       swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
       std::swap(sort_vals[i],sort_vals[idx[i]]);
@@ -237,7 +224,7 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
 template<class Field>
 void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
   result = Zero();
-  assert(_v.size()==eval.size());
+  GRID_ASSERT(_v.size()==eval.size());
   int N = (int)_v.size();
   for (int i=0;i<N;i++) {
     Field& tmp = _v[i];
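basisRotate now keeps two parallel arrays of views: host-side handles, whose ViewClose() must run on the host, and a device-side copy of the same view objects for kernels to index. A minimal sketch of the idiom, under the assumption that a "view" is a trivially copyable handle struct:

  #include <cstddef>
  #include <cstring>
  #include <vector>

  struct View { double* data; std::size_t n; };  // assumed POD view handle

  int main() {
    const int nbasis = 4;
    std::vector<std::vector<double>> fields(nbasis, std::vector<double>(16));
    std::vector<View> h_views(nbasis);  // hostVector<View>: open/close here
    std::vector<View> d_views(nbasis);  // deviceVector<View>: kernels index this
    for (int k = 0; k < nbasis; ++k) {
      h_views[k] = View{fields[k].data(), fields[k].size()};  // View(...)
      std::memcpy(&d_views[k], &h_views[k], sizeof(View));    // acceleratorPut
    }
    View* basis_vp = d_views.data();  // pointer handed to accelerator_for
    (void)basis_vp;
    // ... kernels read basis_vp[k].data; afterwards close host handles only.
    return 0;
  }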

View File

@@ -32,8 +32,8 @@ NAMESPACE_BEGIN(Grid);
 template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
 {
-  assert(lhs.Grid() == rhs.Grid());
-  assert(lhs.Checkerboard() == rhs.Checkerboard());
+  GRID_ASSERT(lhs.Grid() == rhs.Grid());
+  GRID_ASSERT(lhs.Checkerboard() == rhs.Checkerboard());
 }
 NAMESPACE_END(Grid);

View File

@@ -42,7 +42,7 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
   //  Lattice<vobj> Xslice(SliceGrid);
   //  Lattice<vobj> Rslice(SliceGrid);
-  assert( FullGrid->_simd_layout[Orthog]==1);
+  GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -86,7 +86,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
   int Nblock = X.Grid()->GlobalDimensions()[Orthog];
   GridBase *FullGrid  = X.Grid();
-  assert( FullGrid->_simd_layout[Orthog]==1);
+  GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
   //FIXME package in a convenient iterator
   //Should loop over a plane orthogonal to direction "Orthog"
@@ -140,7 +140,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-  assert( FullGrid->_simd_layout[Orthog]==1);
+  GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
   //  int nh =  FullGrid->_ndimension;
   //  int nl = SliceGrid->_ndimension;
   //  int nl = nh-1;

View File

@@ -98,8 +98,8 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
   int Nsimd = grid->Nsimd();
-  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
-  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+  GRID_ASSERT( l.Checkerboard()== l.Grid()->CheckerBoard(site));
+  GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
   int rank,odx,idx;
   // Optional to broadcast from node 0.
@@ -135,7 +135,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
   int Nsimd = grid->Nsimd();
-  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
+  GRID_ASSERT( l.Checkerboard() == l.Grid()->CheckerBoard(site));
   int rank,odx,idx;
   grid->GlobalCoorToRankIndex(rank,odx,idx,site);
@@ -159,14 +159,14 @@ template<class vobj,class sobj>
 inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
 {
   GridBase *grid = l.getGrid();
-  assert(l.mode==CpuRead);
+  GRID_ASSERT(l.mode==CpuRead);
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;
   int Nsimd = grid->Nsimd();
-  assert( l.Checkerboard()== grid->CheckerBoard(site));
-  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+  //  GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
+  GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
   static const int words=sizeof(vobj)/sizeof(vector_type);
   int odx,idx;
@@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
   for(int w=0;w<words;w++){
     pt[w] = getlane(vp[w],idx);
   }
+  //  std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
   return;
 };
 template<class vobj,class sobj>
@@ -195,15 +195,15 @@ template<class vobj,class sobj>
 inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
 {
   GridBase *grid=l.getGrid();
-  assert(l.mode==CpuWrite);
+  GRID_ASSERT(l.mode==CpuWrite);
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;
   int Nsimd = grid->Nsimd();
-  assert( l.Checkerboard()== grid->CheckerBoard(site));
-  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+  //  GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
+  GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
   static const int words=sizeof(vobj)/sizeof(vector_type);
   int odx,idx;

View File

@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
   //  const int Nsimd = vobj::Nsimd();
   const int nthread = GridThread::GetThreads();
-  Vector<sobj> sumarray(nthread);
+  std::vector<sobj> sumarray(nthread);
   for(int i=0;i<nthread;i++){
     sumarray[i]=Zero();
   }
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
   const int nthread = GridThread::GetThreads();
-  Vector<sobj> sumarray(nthread);
+  std::vector<sobj> sumarray(nthread);
   for(int i=0;i<nthread;i++){
     sumarray[i]=Zero();
   }
@@ -264,24 +264,8 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
   const uint64_t sites = grid->oSites();
   // Might make all code paths go this way.
-#if 0
-  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
-  Vector<inner_t> inner_tmp(sites);
-  auto inner_tmp_v = &inner_tmp[0];
-  {
-    autoView( left_v , left, AcceleratorRead);
-    autoView( right_v,right, AcceleratorRead);
-    // This code could read coalesce
-    // GPU - SIMT lane compliance...
-    accelerator_for( ss, sites, nsimd,{
-      auto x_l = left_v(ss);
-      auto y_l = right_v(ss);
-      coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
-    });
-  }
-#else
   typedef decltype(innerProduct(vobj(),vobj())) inner_t;
-  Vector<inner_t> inner_tmp(sites);
+  deviceVector<inner_t> inner_tmp(sites);
   auto inner_tmp_v = &inner_tmp[0];
   {
@@ -295,7 +279,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
       coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
     });
   }
-#endif
   // This is in single precision and fails some tests
   auto anrm = sumD(inner_tmp_v,sites);
   nrm = anrm;
@@ -307,23 +290,45 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
+  bool ok;
 #ifdef GRID_SYCL
-  uint64_t csum=0;
-  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
-  {
+  //  uint64_t csum=0;
+  //  uint64_t csum2=0;
+  //  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  //  {
     // Hack
     // Fast integer xor checksum. Can also be used in comms now.
-    autoView(l_v,left,AcceleratorRead);
-    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
-    uint64_t *base= (uint64_t *)&l_v[0];
-    csum=svm_xor(base,words);
-  }
-  FlightRecorder::CsumLog(csum);
+  //  autoView(l_v,left,AcceleratorRead);
+  //  Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
+  //  uint64_t *base= (uint64_t *)&l_v[0];
+  //  csum=svm_xor(base,words);
+  //  ok = FlightRecorder::CsumLog(csum);
+  //  if ( !ok ) {
+  //    csum2=svm_xor(base,words);
+  //    std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+  //  } else {
+  //    csum2=svm_xor(base,words);
+  //    std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
+  //  }
+  //  GRID_ASSERT(ok);
+  //  }
 #endif
+  FlightRecorder::StepLog("rank inner product");
   ComplexD nrm = rankInnerProduct(left,right);
+  //  ComplexD nrmck=nrm;
   RealD local = real(nrm);
-  FlightRecorder::NormLog(real(nrm));
-  grid->GlobalSum(nrm);
+  ok = FlightRecorder::NormLog(real(nrm));
+  if ( !ok ) {
+    ComplexD nrm2 = rankInnerProduct(left,right);
+    RealD local2 = real(nrm2);
+    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
+    GRID_ASSERT(ok);
+  }
+  FlightRecorder::StepLog("Start global sum");
+  grid->GlobalSumP2P(nrm);
+  //  grid->GlobalSum(nrm);
+  FlightRecorder::StepLog("Finished global sum");
+  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
   FlightRecorder::ReductionLog(local,real(nrm));
   return nrm;
 }
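The innerProduct changes wire the reduction into the FlightRecorder: NormLog records the node-local norm on a first pass (or compares against the recorded value on replay), and on mismatch the norm is recomputed once before asserting, to separate transient corruption from deterministic divergence. A sketch of that record/replay-and-recompute idiom, with a plain std::vector log standing in for FlightRecorder (an illustrative assumption):

  #include <cassert>
  #include <cstdio>
  #include <vector>

  // Record on first pass; compare on replay. Returns false on mismatch.
  struct NormLogger {
    std::vector<double> log;
    size_t cursor = 0;
    bool replay = false;
    bool NormLog(double v) {
      if (!replay) { log.push_back(v); return true; }
      return log[cursor++] == v;  // bitwise-reproducible runs must agree
    }
  };

  double reduce(const std::vector<double>& x) {
    double s = 0; for (double v : x) s += v; return s;
  }

  int main() {
    NormLogger rec;
    std::vector<double> data{1.0, 2.0, 3.0};
    rec.NormLog(reduce(data));           // pass 1: record
    rec.replay = true; rec.cursor = 0;   // pass 2: compare against the log
    double nrm = reduce(data);
    if (!rec.NormLog(nrm)) {
      double nrm2 = reduce(data);        // recompute once: transient or not?
      std::fprintf(stderr, "Bad NORM %f recomputed as %f\n", nrm, nrm2);
      assert(false);
    }
    return 0;
  }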
@@ -360,20 +365,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead); autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead); autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite); autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, nsimd,{
@@ -381,9 +375,13 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
}); });
bool ok;
nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
#endif ok = FlightRecorder::NormLog(real(nrm));
GRID_ASSERT(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@@ -393,7 +391,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
    conformable(left,right);
    typedef typename vobj::vector_typeD vector_type;
-   Vector<ComplexD> tmp(2);
+   std::vector<ComplexD> tmp(2);
    GridBase *grid = left.Grid();
@@ -403,8 +401,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
    // GPU
    typedef decltype(innerProductD(vobj(),vobj())) inner_t;
    typedef decltype(innerProductD(vobj(),vobj())) norm_t;
-   Vector<inner_t> inner_tmp(sites);
-   Vector<norm_t>  norm_tmp(sites);
+   deviceVector<inner_t> inner_tmp(sites);
+   deviceVector<norm_t>  norm_tmp(sites);
    auto inner_tmp_v = &inner_tmp[0];
    auto norm_tmp_v  = &norm_tmp[0];
    {
@@ -454,7 +452,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
  // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
- template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
+ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
+                                           std::vector<typename vobj::scalar_object> &result,
+                                           int orthogdim)
  {
    ///////////////////////////////////////////////////////
    // FIXME precision promoted summation
@@ -464,20 +464,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_object::scalar_type scalar_type;
    GridBase *grid = Data.Grid();
-   assert(grid!=NULL);
+   GRID_ASSERT(grid!=NULL);
    const int Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
-   assert(orthogdim >= 0);
-   assert(orthogdim < Nd);
+   GRID_ASSERT(orthogdim >= 0);
+   GRID_ASSERT(orthogdim < Nd);
    int fd=grid->_fdimensions[orthogdim];
    int ld=grid->_ldimensions[orthogdim];
    int rd=grid->_rdimensions[orthogdim];
-   Vector<vobj> lvSum(rd);         // will locally sum vectors first
-   Vector<sobj> lsSum(ld,Zero());  // sum across these down to scalars
+   std::vector<vobj> lvSum(rd);         // will locally sum vectors first
+   std::vector<sobj> lsSum(ld,Zero());  // sum across these down to scalars
    ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
    result.resize(fd); // And then global sum to return the same vector to every node
@@ -525,6 +525,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
    scalar_type * ptr = (scalar_type *) &result[0];
    int words = fd*sizeof(sobj)/sizeof(scalar_type);
    grid->GlobalSumVector(ptr, words);
+   // std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
  }
  template<class vobj> inline
  std::vector<typename vobj::scalar_object>
@@ -535,28 +537,41 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
    return result;
  }
+ /*
+  Reimplement
+  1)
+  template<class vobj>
+  static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
+  2)
+  template<class vobj>
+  static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
+  3)
+  -- Make Slice Mul Matrix call sliceMaddMatrix
+ */
  template<class vobj>
  static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
  {
    typedef typename vobj::vector_type vector_type;
    typedef typename vobj::scalar_type scalar_type;
    GridBase *grid = lhs.Grid();
-   assert(grid!=NULL);
+   GRID_ASSERT(grid!=NULL);
    conformable(grid,rhs.Grid());
    const int Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
-   assert(orthogdim >= 0);
-   assert(orthogdim < Nd);
+   GRID_ASSERT(orthogdim >= 0);
+   GRID_ASSERT(orthogdim < Nd);
    int fd=grid->_fdimensions[orthogdim];
    int ld=grid->_ldimensions[orthogdim];
    int rd=grid->_rdimensions[orthogdim];
-   Vector<vector_type> lvSum(rd);                    // will locally sum vectors first
-   Vector<scalar_type > lsSum(ld,scalar_type(0.0));  // sum across these down to scalars
+   std::vector<vector_type> lvSum(rd);                    // will locally sum vectors first
+   std::vector<scalar_type > lsSum(ld,scalar_type(0.0));  // sum across these down to scalars
    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
    result.resize(fd); // And then global sum to return the same vector to every node for IO to file
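For orientation, a minimal usage sketch of the slice reductions touched above (assuming a 4-d grid with time in dimension 3; c and grid are hypothetical):

    LatticeComplex c(grid);                 // hypothetical field
    std::vector<TComplex> Ct;               // one scalar object per global time slice
    sliceSum(c, Ct, 3);                     // x,y,z sum per t; identical result on every node
    std::vector<ComplexD> Cij;
    sliceInnerProductVector(Cij, c, c, 3);  // <c,c> resolved per time slice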
@@ -686,203 +701,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
    }
  };
- /*
  inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
  {
    int NN    = BlockSolverGrid->_ndimension;
    int nsimd = BlockSolverGrid->Nsimd();
-   std::vector<int> latt_phys(0);
-   std::vector<int> simd_phys(0);
-   std::vector<int> mpi_phys(0);
+   std::vector<int> latt_phys(NN-1);
+   Coordinate simd_phys;
+   std::vector<int> mpi_phys(NN-1);
+   Coordinate checker_dim_mask(NN-1);
+   int checker_dim=-1;
+   int dd;
    for(int d=0;d<NN;d++){
      if( d!=Orthog ) {
-       latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
-       simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
-       mpi_phys.push_back(BlockSolverGrid->_processors[d]);
+       latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
+       mpi_phys[dd] =BlockSolverGrid->_processors[d];
+       checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
+       if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
+       dd++;
      }
    }
-   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
+   simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
+   GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
+   if(BlockSolverGrid->_isCheckerBoarded) {
+     GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
+     delete tmp;
+     return (GridBase *) ret;
+   } else {
+     return (GridBase *) tmp;
+   }
  }
- */
  template<class vobj>
  static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
  {
+   GridBase *FullGrid  = X.Grid();
+   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+   Lattice<vobj> Ys(SliceGrid);
+   Lattice<vobj> Rs(SliceGrid);
+   Lattice<vobj> Xs(SliceGrid);
+   Lattice<vobj> RR(FullGrid);
+   RR = R; // Copies checkerboard for insert
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::vector_type vector_type;
-   int Nblock = X.Grid()->GlobalDimensions()[Orthog];
-   GridBase *FullGrid  = X.Grid();
-   // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-   // Lattice<vobj> Xslice(SliceGrid);
-   // Lattice<vobj> Rslice(SliceGrid);
-   assert( FullGrid->_simd_layout[Orthog]==1);
-   // int nh =  FullGrid->_ndimension;
-   // int nl = SliceGrid->_ndimension;
-   // int nl = nh-1;
-   //FIXME package in a convenient iterator
-   //Should loop over a plane orthogonal to direction "Orthog"
-   int stride=FullGrid->_slice_stride[Orthog];
-   int block =FullGrid->_slice_block [Orthog];
-   int nblock=FullGrid->_slice_nblock[Orthog];
-   int ostride=FullGrid->_ostride[Orthog];
-   autoView( X_v, X, CpuRead);
-   autoView( Y_v, Y, CpuRead);
-   autoView( R_v, R, CpuWrite);
-   thread_region
-   {
-     Vector<vobj> s_x(Nblock);
-     thread_for_collapse_in_region(2, n,nblock, {
-       for(int b=0;b<block;b++){
-         int o  = n*stride + b;
-         for(int i=0;i<Nblock;i++){
-           s_x[i] = X_v[o+i*ostride];
-         }
-         vobj dot;
-         for(int i=0;i<Nblock;i++){
-           dot = Y_v[o+i*ostride];
-           for(int j=0;j<Nblock;j++){
-             dot = dot + s_x[j]*(scale*aa(j,i));
-           }
-           R_v[o+i*ostride]=dot;
-         }
-       }});
-   }
+   int Nslice = X.Grid()->GlobalDimensions()[Orthog];
+   for(int i=0;i<Nslice;i++){
+     ExtractSlice(Ys,Y,i,Orthog);
+     ExtractSlice(Rs,R,i,Orthog);
+     Rs=Ys;
+     for(int j=0;j<Nslice;j++){
+       ExtractSlice(Xs,X,j,Orthog);
+       Rs = Rs + Xs*(scale*aa(j,i));
+     }
+     InsertSlice(Rs,RR,i,Orthog);
+   }
+   R=RR; // Copy back handles arguments aliasing case
+   delete SliceGrid;
  };
  template<class vobj>
  static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
  {
-   typedef typename vobj::scalar_object sobj;
-   typedef typename vobj::vector_type vector_type;
-   int Nblock = X.Grid()->GlobalDimensions()[Orthog];
-   GridBase *FullGrid  = X.Grid();
-   // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-   // Lattice<vobj> Xslice(SliceGrid);
-   // Lattice<vobj> Rslice(SliceGrid);
-   assert( FullGrid->_simd_layout[Orthog]==1);
-   // int nh =  FullGrid->_ndimension;
-   // int nl = SliceGrid->_ndimension;
-   // int nl=1;
-   //FIXME package in a convenient iterator
-   // thread_for2d_in_region
-   //Should loop over a plane orthogonal to direction "Orthog"
-   int stride=FullGrid->_slice_stride[Orthog];
-   int block =FullGrid->_slice_block [Orthog];
-   int nblock=FullGrid->_slice_nblock[Orthog];
-   int ostride=FullGrid->_ostride[Orthog];
-   autoView( R_v, R, CpuWrite);
-   autoView( X_v, X, CpuRead);
-   thread_region
-   {
-     std::vector<vobj> s_x(Nblock);
-     thread_for_collapse_in_region( 2 ,n,nblock,{
-       for(int b=0;b<block;b++){
-         int o  = n*stride + b;
-         for(int i=0;i<Nblock;i++){
-           s_x[i] = X_v[o+i*ostride];
-         }
-         vobj dot;
-         for(int i=0;i<Nblock;i++){
-           dot = s_x[0]*(scale*aa(0,i));
-           for(int j=1;j<Nblock;j++){
-             dot = dot + s_x[j]*(scale*aa(j,i));
-           }
-           R_v[o+i*ostride]=dot;
-         }
-       }});
-   }
+   R=Zero();
+   sliceMaddMatrix(R,aa,X,R,Orthog,scale);
  };
  template<class vobj>
  static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
  {
+   GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
+   Lattice<vobj> ls(SliceGrid);
+   Lattice<vobj> rs(SliceGrid);
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::vector_type vector_type;
-   GridBase *FullGrid  = lhs.Grid();
-   // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-   int Nblock = FullGrid->GlobalDimensions()[Orthog];
-   // Lattice<vobj> Lslice(SliceGrid);
-   // Lattice<vobj> Rslice(SliceGrid);
-   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-   assert( FullGrid->_simd_layout[Orthog]==1);
-   // int nh =  FullGrid->_ndimension;
-   // int nl = SliceGrid->_ndimension;
-   // int nl = nh-1;
-   //FIXME package in a convenient iterator
-   //Should loop over a plane orthogonal to direction "Orthog"
-   int stride=FullGrid->_slice_stride[Orthog];
-   int block =FullGrid->_slice_block [Orthog];
-   int nblock=FullGrid->_slice_nblock[Orthog];
-   int ostride=FullGrid->_ostride[Orthog];
-   typedef typename vobj::vector_typeD vector_typeD;
-   autoView( lhs_v, lhs, CpuRead);
-   autoView( rhs_v, rhs, CpuRead);
-   thread_region
-   {
-     std::vector<vobj> Left(Nblock);
-     std::vector<vobj> Right(Nblock);
-     Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-     thread_for_collapse_in_region( 2, n,nblock,{
-       for(int b=0;b<block;b++){
-         int o  = n*stride + b;
-         for(int i=0;i<Nblock;i++){
-           Left [i] = lhs_v[o+i*ostride];
-           Right[i] = rhs_v[o+i*ostride];
-         }
-         for(int i=0;i<Nblock;i++){
-         for(int j=0;j<Nblock;j++){
-           auto tmp  = innerProduct(Left[i],Right[j]);
-           auto rtmp = TensorRemove(tmp);
-           auto red  = Reduce(rtmp);
-           mat_thread(i,j) += std::complex<double>(real(red),imag(red));
-         }}
-       }});
-     thread_critical
-     {
-       mat += mat_thread;
-     }
-   }
-   for(int i=0;i<Nblock;i++){
-   for(int j=0;j<Nblock;j++){
-     ComplexD sum = mat(i,j);
-     FullGrid->GlobalSum(sum);
-     mat(i,j)=sum;
-   }}
-   return;
+   int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
+   mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
+   for(int s=0;s<Nslice;s++){
+     ExtractSlice(ls,lhs,s,Orthog);
+     for(int ss=0;ss<Nslice;ss++){
+       ExtractSlice(rs,rhs,ss,Orthog);
+       mat(s,ss) = innerProduct(ls,rs);
+     }
+   }
+   delete SliceGrid;
  }
  NAMESPACE_END(Grid);
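A usage sketch of the three rewritten helpers (R, X, Y are hypothetical Lattice<vobj> fields on the same full grid). Per slice i along Orthog, the madd computes R_i = Y_i + scale * sum_j X_j aa(j,i); the mul is the Y = 0 special case; the inner-product matrix collects mat(i,j) = innerProduct(X_i, Y_j):

    int Orthog = 3;                                   // e.g. the time direction
    int Nslice = X.Grid()->GlobalDimensions()[Orthog];
    Eigen::MatrixXcd aa = Eigen::MatrixXcd::Identity(Nslice,Nslice);
    sliceMaddMatrix(R, aa, X, Y, Orthog, 1.0);        // R_i = Y_i + sum_j X_j aa(j,i)
    sliceMulMatrix (R, aa, X, Orthog, 1.0);           // R_i = sum_j X_j aa(j,i)
    Eigen::MatrixXcd mat(Nslice,Nslice);
    sliceInnerProductMatrix(mat, X, Y, Orthog);       // mat(i,j) = <X_i, Y_j>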

View File

@@ -208,28 +208,18 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
    Integer numThreads, numBlocks;
    int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-   assert(ok);
+   GRID_ASSERT(ok);
    Integer smemSize = numThreads * sizeof(sobj);
    // Move out of UVM
    // Turns out I had messed up the synchronise after move to compute stream
    // as running this on the default stream fools the synchronise
-   #undef UVM_BLOCK_BUFFER
-   #ifndef UVM_BLOCK_BUFFER
-   commVector<sobj> buffer(numBlocks);
+   deviceVector<sobj> buffer(numBlocks);
    sobj *buffer_v = &buffer[0];
    sobj result;
    reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
    accelerator_barrier();
    acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
-   #else
-   Vector<sobj> buffer(numBlocks);
-   sobj *buffer_v = &buffer[0];
-   sobj result;
-   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
-   accelerator_barrier();
-   result = *buffer_v;
-   #endif
    return result;
  }
@@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
    const int words = sizeof(vobj)/sizeof(vector);
-   Vector<vector> buffer(osites);
+   deviceVector<vector> buffer(osites);
    vector *dat = (vector *)lat;
    vector *buf = &buffer[0];
    iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];

View File

@@ -4,29 +4,28 @@ NAMESPACE_BEGIN(Grid);
  // Possibly promote to double and sum
  /////////////////////////////////////////////////////////////////////////////////////////////////////////
  template <class vobj>
  inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_objectD sobjD;
-   sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
    sobj identity; zeroit(identity);
-   sobj ret ;
+   sobj ret; zeroit(ret);
    Integer nsimd= vobj::Nsimd();
-   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
-     cgh.parallel_for(cl::sycl::range<1>{osites},
+   {
+     sycl::buffer<sobj, 1> abuff(&ret, {1});
+     theGridAccelerator->submit([&](sycl::handler &cgh) {
+       auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
+       cgh.parallel_for(sycl::range<1>{osites},
          Reduction,
-         [=] (cl::sycl::id<1> item, auto &sum) {
+         [=] (sycl::id<1> item, auto &sum) {
            auto osite = item[0];
            sum +=Reduce(lat[osite]);
          });
     });
-   theGridAccelerator->wait();
-   ret = mysum[0];
-   free(mysum,*theGridAccelerator);
+   }
    sobjD dret; convertType(dret,ret);
    return dret;
  }
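The same buffer-reduction idiom in isolation (a self-contained sketch, not Grid code; `data` is assumed to be USM-allocated so the kernel may dereference it): the result lives in a one-element buffer aliasing a host variable, and the buffer destructor at scope exit synchronises and copies the value back, replacing the old malloc_shared/free round trip.

    #include <sycl/sycl.hpp>
    #include <functional>
    double device_sum(sycl::queue &q, const double *data, size_t n) {
      double ret = 0;
      { // one-element buffer aliases 'ret'
        sycl::buffer<double, 1> abuff(&ret, {1});
        q.submit([&](sycl::handler &cgh) {
          auto red = sycl::reduction(abuff, cgh, 0.0, std::plus<>());
          cgh.parallel_for(sycl::range<1>{n}, red,
            [=](sycl::id<1> i, auto &sum) { sum += data[i]; });
        });
      } // buffer destructor waits and writes the result into 'ret'
      return ret;
    }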
@@ -72,55 +71,41 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
  template<class Word> Word svm_xor(Word *vec,uint64_t L)
  {
-   Word xorResult; xorResult = 0;
-   Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
    Word identity; identity=0;
-   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-     auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
-     cgh.parallel_for(cl::sycl::range<1>{L},
+   Word ret = 0;
+   {
+     sycl::buffer<Word, 1> abuff(&ret, {1});
+     theGridAccelerator->submit([&](sycl::handler &cgh) {
+       auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+       cgh.parallel_for(sycl::range<1>{L},
          Reduction,
-         [=] (cl::sycl::id<1> index, auto &sum) {
+         [=] (sycl::id<1> index, auto &sum) {
            sum ^=vec[index];
          });
     });
+   }
    theGridAccelerator->wait();
-   Word ret = d_sum[0];
-   free(d_sum,*theGridAccelerator);
    return ret;
  }
+ template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
+ {
+   Word identity; identity=0;
+   Word ret = 0;
+   {
+     sycl::buffer<Word, 1> abuff(&ret, {1});
+     theGridAccelerator->submit([&](sycl::handler &cgh) {
+       auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+       cgh.parallel_for(sycl::range<1>{L},
+         Reduction,
+         [=] (sycl::id<1> index, auto &sum) {
+           auto l = index % 61;
+           sum ^= vec[index]<<l | vec[index]>>(64-l);
+         });
+     });
+   }
+   theGridAccelerator->wait();
+   return ret;
+ }
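A host-side reference for the new checksum kernel (a sketch for 64-bit words): rotating each word by (index mod 61) before XOR-folding makes the checksum sensitive to word order, which a plain XOR fold is not. The l==0 case is guarded here because a 64-bit shift count is undefined in C++.

    #include <cstdint>
    uint64_t checksum_host(const uint64_t *vec, uint64_t L) {
      uint64_t sum = 0;
      for (uint64_t i = 0; i < L; i++) {
        unsigned l = i % 61; // rotate amount; 61 is prime so patterns do not lock in
        sum ^= l ? ((vec[i] << l) | (vec[i] >> (64 - l))) : vec[i];
      }
      return sum;
    }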
  NAMESPACE_END(Grid);
- /*
- template <class vobj>
- inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
- {
-   typedef typename vobj::vector_type  vector;
-   typedef typename vobj::scalar_type  scalar;
-   typedef typename vobj::scalar_typeD scalarD;
-   typedef typename vobj::scalar_objectD sobjD;
-   sobjD ret;
-   scalarD *ret_p = (scalarD *)&ret;
-   const int nsimd = vobj::Nsimd();
-   const int words = sizeof(vobj)/sizeof(vector);
-   Vector<scalar> buffer(osites*nsimd);
-   scalar *buf = &buffer[0];
-   vector *dat = (vector *)lat;
-   for(int w=0;w<words;w++) {
-     accelerator_for(ss,osites,nsimd,{
-       int lane = acceleratorSIMTlane(nsimd);
-       buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
-     });
-     //Precision change at this point is to late to gain precision
-     ret_p[w] = svm_reduce(buf,nsimd*osites);
-   }
-   return ret;
- }
- */

View File

@@ -53,10 +53,10 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
    int lowerdims = fine->_ndimension - coarse->_ndimension;
-   assert(lowerdims >= 0);
+   GRID_ASSERT(lowerdims >= 0);
    for(int d=0;d<lowerdims;d++){
-     assert(fine->_simd_layout[d]==1);
-     assert(fine->_processors[d]==1);
+     GRID_ASSERT(fine->_simd_layout[d]==1);
+     GRID_ASSERT(fine->_processors[d]==1);
    }
    int multiplicity=1;
@@ -66,9 +66,9 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
    // local and global volumes subdivide cleanly after SIMDization
    for(int d=0;d<rngdims;d++){
      int fd= d+lowerdims;
-     assert(coarse->_processors[d]  == fine->_processors[fd]);
-     assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
-     assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
+     GRID_ASSERT(coarse->_processors[d]  == fine->_processors[fd]);
+     GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+     GRID_ASSERT(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
      multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
    }
@@ -83,18 +83,18 @@ inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
    int rngdims = coarse->_ndimension;
    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
-   int lowerdims = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0);
+   int lowerdims = fine->_ndimension - coarse->_ndimension;  GRID_ASSERT(lowerdims >= 0);
    // assumes that the higher dimensions are not using more processors
    // all further divisions are local
-   for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
-   for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
+   for(int d=0;d<lowerdims;d++) GRID_ASSERT(fine->_processors[d]==1);
+   for(int d=0;d<rngdims;d++) GRID_ASSERT(coarse->_processors[d] == fine->_processors[d+lowerdims]);
    // then divide the number of local sites
    // check that the total number of sims agree, meanse the iSites are the same
-   assert(fine->Nsimd() == coarse->Nsimd());
+   GRID_ASSERT(fine->Nsimd() == coarse->Nsimd());
    // check that the two grids divide cleanly
-   assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
+   GRID_ASSERT( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
    return fine->lSites() / coarse->lSites();
  }
@@ -177,7 +177,7 @@ public:
    skip = skip<<shift;
-   assert((skip >> shift)==site); // check for overflow
+   GRID_ASSERT((skip >> shift)==site); // check for overflow
    eng.discard(skip);
  #else
@@ -218,7 +218,7 @@ public:
      GetState(saved,_generators[gen]);
  }
  void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
-   assert(saved.size()==RngStateCount);
+   GRID_ASSERT(saved.size()==RngStateCount);
    std::stringstream ss;
    for(int i=0;i<RngStateCount;i++){
      ss<< saved[i]<<" ";
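The bulk rename assert -> GRID_ASSERT here (and in the files below) swaps the libc macro for Grid's own checked abort. The real definition lives in Grid's headers; a hypothetical stand-in, just to show what the rename buys (file/line plus a report before terminating, and no NDEBUG elision):

    #include <cstdio>
    #include <cstdlib>
    #define DEMO_GRID_ASSERT(b)                                       \
      do {                                                            \
        if (!(b)) {                                                   \
          std::fprintf(stderr, "Assertion %s failed at %s:%d\n",      \
                       #b, __FILE__, __LINE__);                       \
          /* Grid's version also emits a backtrace here */            \
          std::abort();                                               \
        }                                                             \
      } while (0)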

View File

@@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);
  #if defined(GRID_CUDA) || defined(GRID_HIP)
- template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+ template<class vobj>
+ inline void sliceSumReduction_cub_small(const vobj *Data,
+                                         std::vector<vobj> &lvSum,
+                                         const int rd,
+                                         const int e1,
+                                         const int e2,
+                                         const int stride,
+                                         const int ostride,
+                                         const int Nsimd)
+ {
    size_t subvol_size = e1*e2;
-   commVector<vobj> reduction_buffer(rd*subvol_size);
+   deviceVector<vobj> reduction_buffer(rd*subvol_size);
    auto rb_p = &reduction_buffer[0];
    vobj zero_init;
    zeroit(zero_init);
@@ -46,7 +55,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
    d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
    //copy offsets to device
-   acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+   acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
    gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -79,7 +88,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
      exit(EXIT_FAILURE);
    }
-   acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+   acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
    //sync after copy
    accelerator_barrier();
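For reference, the two-phase workspace-query pattern behind gpucub::DeviceSegmentedReduce above, as a stand-alone CUDA sketch (hypothetical buffers; the HIP path goes through the same gpucub alias, and segment r covers [offsets[r], offsets[r+1]) ):

    #include <cub/cub.cuh>
    void segmented_sum(const float *d_in, float *d_out,
                       const int *d_offsets, int num_segments, cudaStream_t stream) {
      void  *d_temp = nullptr;
      size_t temp_bytes = 0;
      // Pass 1: null workspace -> CUB only reports the bytes it needs.
      cub::DeviceSegmentedReduce::Sum(d_temp, temp_bytes, d_in, d_out,
                                      num_segments, d_offsets, d_offsets + 1, stream);
      cudaMalloc(&d_temp, temp_bytes);
      // Pass 2: the actual per-segment reduction on 'stream'.
      cub::DeviceSegmentedReduce::Sum(d_temp, temp_bytes, d_in, d_out,
                                      num_segments, d_offsets, d_offsets + 1, stream);
      cudaFree(d_temp);
    }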
@@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
  #if defined(GRID_SYCL)
- template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+ template<class vobj>
+ inline void sliceSumReduction_sycl_small(const vobj *Data,
+                                          std::vector <vobj> &lvSum,
+                                          const int &rd,
+                                          const int &e1,
+                                          const int &e2,
+                                          const int &stride,
+                                          const int &ostride,
+                                          const int &Nsimd)
  {
    size_t subvol_size = e1*e2;
@@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
      mysum[r] = vobj_zero;
    }
-   commVector<vobj> reduction_buffer(rd*subvol_size);
+   deviceVector<vobj> reduction_buffer(rd*subvol_size);
    auto rb_p = &reduction_buffer[0];
@@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
    });
    for (int r = 0; r < rd; r++) {
-     theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-       auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
-       cgh.parallel_for(cl::sycl::range<1>{subvol_size},
+     theGridAccelerator->submit([&](sycl::handler &cgh) {
+       auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
+       cgh.parallel_for(sycl::range<1>{subvol_size},
          Reduction,
-         [=](cl::sycl::id<1> item, auto &sum) {
+         [=](sycl::id<1> item, auto &sum) {
            auto s = item[0];
            sum += rb_p[r*subvol_size+s];
          });
@@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
  }
  #endif
- template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
+ template<class vobj>
+ inline void sliceSumReduction_large(const vobj *Data,
+                                     std::vector<vobj> &lvSum,
+                                     const int rd,
+                                     const int e1,
+                                     const int e2,
+                                     const int stride,
+                                     const int ostride,
+                                     const int Nsimd)
+ {
    typedef typename vobj::vector_type vector;
    const int words = sizeof(vobj)/sizeof(vector);
    const int osites = rd*e1*e2;
-   commVector<vector>buffer(osites);
+   deviceVector<vector>buffer(osites);
    vector *dat = (vector *)Data;
    vector *buf = &buffer[0];
-   Vector<vector> lvSum_small(rd);
+   std::vector<vector> lvSum_small(rd);
    vector *lvSum_ptr = (vector *)&lvSum[0];
    for (int w = 0; w < words; w++) {
@@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto
      for (int r = 0; r < rd; r++) {
        lvSum_ptr[w+words*r]=lvSum_small[r];
      }
    }
  }
- template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
+ template<class vobj>
+ inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
+                                   std::vector<vobj> &lvSum,
+                                   const int rd,
+                                   const int e1,
+                                   const int e2,
+                                   const int stride,
+                                   const int ostride,
+                                   const int Nsimd)
  {
    autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
    if constexpr (sizeof(vobj) <= 256) {
@@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data
    }
- template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+ template<class vobj>
+ inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
+                                   std::vector<vobj> &lvSum,
+                                   const int &rd,
+                                   const int &e1,
+                                   const int &e2,
+                                   const int &stride,
+                                   const int &ostride,
+                                   const int &Nsimd)
  {
    // sum over reduced dimension planes, breaking out orthog dir
    // Parallel over orthog direction
@@ -208,15 +247,19 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
    });
  }
- template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
+ template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
+                                                    std::vector<vobj> &lvSum,
+                                                    const int &rd,
+                                                    const int &e1,
+                                                    const int &e2,
+                                                    const int &stride,
+                                                    const int &ostride,
+                                                    const int &Nsimd)
  {
  #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
    sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  #else
    sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
  #endif
  }

View File

@@ -31,15 +31,15 @@ NAMESPACE_BEGIN(Grid);
  inline void subdivides(GridBase *coarse,GridBase *fine)
  {
-   assert(coarse->_ndimension == fine->_ndimension);
+   GRID_ASSERT(coarse->_ndimension == fine->_ndimension);
    int _ndimension = coarse->_ndimension;
    // local and global volumes subdivide cleanly after SIMDization
    for(int d=0;d<_ndimension;d++){
-     assert(coarse->_processors[d]  == fine->_processors[d]);
-     assert(coarse->_simd_layout[d] == fine->_simd_layout[d]);
-     assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
+     GRID_ASSERT(coarse->_processors[d]  == fine->_processors[d]);
+     GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[d]);
+     GRID_ASSERT((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
    }
  }
@@ -309,7 +309,7 @@ inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &co
                                const VLattice &Basis)
  {
    int NBatch = fineData.size();
-   assert(coarseData.size() == NBatch);
+   GRID_ASSERT(coarseData.size() == NBatch);
    GridBase * fine  = fineData[0].Grid();
    GridBase * coarse= coarseData[0].Grid();
@@ -344,7 +344,7 @@ template<class vobj,class vobj2,class CComplex>
    GridBase * coarse= coarseA.Grid();
    fineZ.Checkerboard()=fineX.Checkerboard();
-   assert(fineX.Checkerboard()==fineY.Checkerboard());
+   GRID_ASSERT(fineX.Checkerboard()==fineY.Checkerboard());
    subdivides(coarse,fine); // require they map
    conformable(fineX,fineY);
    conformable(fineX,fineZ);
@@ -356,7 +356,7 @@ template<class vobj,class vobj2,class CComplex>
    // FIXME merge with subdivide checking routine as this is redundant
    for(int d=0 ; d<_ndimension;d++){
      block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
-     assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
+     GRID_ASSERT(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
    }
    autoView( fineZ_ , fineZ, AcceleratorWrite);
@@ -613,7 +613,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
    int _ndimension = coarse->_ndimension;
    // checks
-   assert( nbasis == Basis.size() );
+   GRID_ASSERT( nbasis == Basis.size() );
    subdivides(coarse,fine);
    for(int i=0;i<nbasis;i++){
      conformable(Basis[i].Grid(),fine);
@@ -687,7 +687,7 @@ inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>
                                const VLattice &Basis)
  {
    int NBatch = coarseData.size();
-   assert(fineData.size() == NBatch);
+   GRID_ASSERT(fineData.size() == NBatch);
    GridBase * fine   = fineData[0].Grid();
    GridBase * coarse = coarseData[0].Grid();
@@ -715,12 +715,12 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
    int ni = ig->_ndimension;
    int no = og->_ndimension;
-   assert(ni == no);
+   GRID_ASSERT(ni == no);
    for(int d=0;d<no;d++){
-     assert(ig->_processors[d]  == og->_processors[d]);
-     assert(ig->_ldimensions[d] == og->_ldimensions[d]);
-     assert(ig->lSites() == og->lSites());
+     GRID_ASSERT(ig->_processors[d]  == og->_processors[d]);
+     GRID_ASSERT(ig->_ldimensions[d] == og->_ldimensions[d]);
+     GRID_ASSERT(ig->lSites() == og->lSites());
    }
    autoView(in_v,in,CpuRead);
@@ -752,16 +752,16 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
    GridBase *Fg = From.Grid();
    GridBase *Tg = To.Grid();
-   assert(!Fg->_isCheckerBoarded);
-   assert(!Tg->_isCheckerBoarded);
+   GRID_ASSERT(!Fg->_isCheckerBoarded);
+   GRID_ASSERT(!Tg->_isCheckerBoarded);
    int Nsimd = Fg->Nsimd();
    int nF = Fg->_ndimension;
    int nT = Tg->_ndimension;
    int nd = nF;
-   assert(nF == nT);
+   GRID_ASSERT(nF == nT);
    for(int d=0;d<nd;d++){
-     assert(Fg->_processors[d] == Tg->_processors[d]);
+     GRID_ASSERT(Fg->_processors[d] == Tg->_processors[d]);
    }
    ///////////////////////////////////////////////////////////
@@ -821,12 +821,12 @@ void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int
    //////////////////////////////////////////////////////////////////////////////////////////
    GridBase *Fg = From.Grid();
    GridBase *Tg = To.Grid();
-   assert(!Fg->_isCheckerBoarded);
-   assert(!Tg->_isCheckerBoarded);
+   GRID_ASSERT(!Fg->_isCheckerBoarded);
+   GRID_ASSERT(!Tg->_isCheckerBoarded);
    int Nsimd = Fg->Nsimd();
    int nF = Fg->_ndimension;
    int nT = Tg->_ndimension;
-   assert(nF+1 == nT);
+   GRID_ASSERT(nF+1 == nT);
    ///////////////////////////////////////////////////////////
    // do the index calc on the GPU
@@ -890,12 +890,12 @@ void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, in
    //////////////////////////////////////////////////////////////////////////////////////////
    GridBase *Fg = From.Grid();
    GridBase *Tg = To.Grid();
-   assert(!Fg->_isCheckerBoarded);
-   assert(!Tg->_isCheckerBoarded);
+   GRID_ASSERT(!Fg->_isCheckerBoarded);
+   GRID_ASSERT(!Tg->_isCheckerBoarded);
    int Nsimd = Fg->Nsimd();
    int nF = Fg->_ndimension;
    int nT = Tg->_ndimension;
-   assert(nT+1 == nF);
+   GRID_ASSERT(nT+1 == nF);
    ///////////////////////////////////////////////////////////
    // do the index calc on the GPU
@@ -955,16 +955,16 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
    int nl = lg->_ndimension;
    int nh = hg->_ndimension;
-   assert(nl+1 == nh);
-   assert(orthog<nh);
-   assert(orthog>=0);
-   assert(hg->_processors[orthog]==1);
+   GRID_ASSERT(nl+1 == nh);
+   GRID_ASSERT(orthog<nh);
+   GRID_ASSERT(orthog>=0);
+   GRID_ASSERT(hg->_processors[orthog]==1);
    int dl; dl = 0;
    for(int d=0;d<nh;d++){
      if ( d != orthog) {
-       assert(lg->_processors[dl]  == hg->_processors[d]);
-       assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+       GRID_ASSERT(lg->_processors[dl]  == hg->_processors[d]);
+       GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
        dl++;
      }
    }
@@ -981,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
    hcoor[orthog] = slice;
    for(int d=0;d<nh;d++){
      if ( d!=orthog ) {
-       hcoor[d]=lcoor[ddl++];
+       hcoor[d]=lcoor[ddl];
+       if ( hg->_checker_dim == d ) {
+         hcoor[d]=hcoor[d]*2;     // factor in the full coor for peekLocalSite
+         lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
+       }
+       ddl++;
      }
    }
    peekLocalSite(s,lowDimv,lcoor);
    pokeLocalSite(s,higherDimv,hcoor);
@@ -999,16 +1005,17 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
    int nl = lg->_ndimension;
    int nh = hg->_ndimension;
-   assert(nl+1 == nh);
-   assert(orthog<nh);
-   assert(orthog>=0);
-   assert(hg->_processors[orthog]==1);
+   GRID_ASSERT(nl+1 == nh);
+   GRID_ASSERT(orthog<nh);
+   GRID_ASSERT(orthog>=0);
+   GRID_ASSERT(hg->_processors[orthog]==1);
+   lowDim.Checkerboard() = higherDim.Checkerboard();
    int dl; dl = 0;
    for(int d=0;d<nh;d++){
      if ( d != orthog) {
-       assert(lg->_processors[dl]  == hg->_processors[d]);
-       assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+       GRID_ASSERT(lg->_processors[dl]  == hg->_processors[d]);
+       GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
        dl++;
      }
    }
@@ -1020,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
    Coordinate lcoor(nl);
    Coordinate hcoor(nh);
    lg->LocalIndexToLocalCoor(idx,lcoor);
-   int ddl=0;
    hcoor[orthog] = slice;
+   int ddl=0;
    for(int d=0;d<nh;d++){
      if ( d!=orthog ) {
-       hcoor[d]=lcoor[ddl++];
+       hcoor[d]=lcoor[ddl];
+       if ( hg->_checker_dim == d ) {
+         hcoor[d]=hcoor[d]*2;     // factor in the full grid coor for peekLocalSite
+         lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
+       }
+       ddl++;
      }
    }
    peekLocalSite(s,higherDimv,hcoor);
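The doubling in these two hunks encodes the convention that peekLocalSite/pokeLocalSite take full-grid coordinates even on a red-black grid: along _checker_dim the local array holds every second site, so half-grid coordinate c corresponds to full coordinate 2c, with the odd/even offset carried separately by the field's Checkerboard() flag. In sketch form (hypothetical helper name):

    // half-grid coordinate -> full-grid coordinate along the checkered dimension;
    // the parity offset is applied inside peekLocalSite/pokeLocalSite themselves
    inline int fullCheckerCoor(int halfCoor) { return 2*halfCoor; }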
@@ -1044,14 +1056,14 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
    int nl = lg->_ndimension;
    int nh = hg->_ndimension;
-   assert(nl == nh);
-   assert(orthog<nh);
-   assert(orthog>=0);
+   GRID_ASSERT(nl == nh);
+   GRID_ASSERT(orthog<nh);
+   GRID_ASSERT(orthog>=0);
    for(int d=0;d<nh;d++){
      if ( d!=orthog ) {
-       assert(lg->_processors[d]  == hg->_processors[d]);
-       assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
+       GRID_ASSERT(lg->_processors[d]  == hg->_processors[d]);
+       GRID_ASSERT(lg->_ldimensions[d] == hg->_ldimensions[d]);
      }
    }
    Coordinate sz = lg->_ldimensions;
@@ -1081,7 +1093,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
    subdivides(cg,fg);
-   assert(cg->_ndimension==fg->_ndimension);
+   GRID_ASSERT(cg->_ndimension==fg->_ndimension);
    Coordinate ratio(cg->_ndimension);
@@ -1145,7 +1157,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
      int lex;
      Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
-     assert(lex < out.size());
+     GRID_ASSERT(lex < out.size());
      out_ptrs[lane] = &out[lex];
    }
@@ -1209,7 +1221,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
    typedef typename vobj::vector_type vtype;
    GridBase* grid = out.Grid();
-   assert(in.size()==grid->lSites());
+   GRID_ASSERT(in.size()==grid->lSites());
    const int ndim = grid->Nd();
    constexpr int nsimd = vtype::Nsimd();
@@ -1256,7 +1268,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
    typedef typename vobj::vector_type vtype;
    GridBase* grid = out._grid;
-   assert(in.size()==grid->lSites());
+   GRID_ASSERT(in.size()==grid->lSites());
    int ndim = grid->Nd();
    int nsimd = vtype::Nsimd();
@@ -1317,9 +1329,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
  template<class VobjOut, class VobjIn>
  void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
  {
-   assert(out.Grid()->Nd() == in.Grid()->Nd());
+   GRID_ASSERT(out.Grid()->Nd() == in.Grid()->Nd());
    for(int d=0;d<out.Grid()->Nd();d++){
-     assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+     GRID_ASSERT(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
    }
    out.Checkerboard() = in.Checkerboard();
    GridBase *in_grid=in.Grid();
@@ -1370,9 +1382,9 @@ class precisionChangeWorkspace{
  public:
    precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
      //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
-     assert(out_grid->Nd() == in_grid->Nd());
+     GRID_ASSERT(out_grid->Nd() == in_grid->Nd());
      for(int d=0;d<out_grid->Nd();d++){
-       assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
+       GRID_ASSERT(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
      }
      int Nsimd_out = out_grid->Nsimd();
@@ -1537,7 +1549,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
    int full_vecs = full.size();
-   assert(full_vecs>=1);
+   GRID_ASSERT(full_vecs>=1);
    GridBase * full_grid = full[0].Grid();
    GridBase *split_grid = split.Grid();
@@ -1555,18 +1567,18 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
    //////////////////////////////
    // Checks
    //////////////////////////////
-   assert(full_grid->_ndimension==split_grid->_ndimension);
+   GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
    for(int n=0;n<full_vecs;n++){
-     assert(full[n].Checkerboard() == cb);
+     GRID_ASSERT(full[n].Checkerboard() == cb);
      for(int d=0;d<ndim;d++){
-       assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
-       assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
+       GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
+       GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
      }
    }
    int nvector =full_nproc/split_nproc;
-   assert(nvector*split_nproc==full_nproc);
-   assert(nvector == full_vecs);
+   GRID_ASSERT(nvector*split_nproc==full_nproc);
+   GRID_ASSERT(nvector == full_vecs);
    Coordinate ratio(ndim);
    for(int d=0;d<ndim;d++){
@@ -1610,7 +1622,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
      int fvol  = lsites;
-     int chunk = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);
+     int chunk = (nvec*fvol)/sP;          GRID_ASSERT(chunk*sP == nvec*fvol);
      // Loop over reordered data post A2A
      thread_for(c, chunk, {
@@ -1663,7 +1675,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
    int full_vecs = full.size();
-   assert(full_vecs>=1);
+   GRID_ASSERT(full_vecs>=1);
    GridBase * full_grid = full[0].Grid();
    GridBase *split_grid = split.Grid();
@@ -1681,18 +1693,18 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
    //////////////////////////////
    // Checks
    //////////////////////////////
-   assert(full_grid->_ndimension==split_grid->_ndimension);
+   GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
    for(int n=0;n<full_vecs;n++){
-     assert(full[n].Checkerboard() == cb);
+     GRID_ASSERT(full[n].Checkerboard() == cb);
      for(int d=0;d<ndim;d++){
-       assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
-       assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
+       GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
+       GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
      }
    }
    int nvector =full_nproc/split_nproc;
-   assert(nvector*split_nproc==full_nproc);
-   assert(nvector == full_vecs);
+   GRID_ASSERT(nvector*split_nproc==full_nproc);
+   GRID_ASSERT(nvector == full_vecs);
    Coordinate ratio(ndim);
    for(int d=0;d<ndim;d++){
@@ -1728,7 +1740,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
      auto lsites= rsites/M;                // Decreases rsites by M
      int fvol  = lsites;
-     int chunk = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);
+     int chunk = (nvec*fvol)/sP;          GRID_ASSERT(chunk*sP == nvec*fvol);
      {
        // Loop over reordered data post A2A

View File

@@ -106,6 +106,47 @@ public:
    }
  };
+ #ifdef GRID_LOG_VIEWS
+ // Little autoscope assister
+ template<class View>
+ class ViewCloser
+ {
+   View v; // Take a copy of view and call view close when I go out of scope automatically
+   const char* filename; int line, mode;
+  public:
+   ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
+     v(_v), filename(_filename), line(_line), mode(_mode) {
+     switch (mode){
+     case AcceleratorRead:
+     case AcceleratorWrite:
+     case CpuRead:
+     case CpuWrite:
+       ViewLogger::LogOpen(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
+       break;
+     }
+   };
+   ~ViewCloser() {
+     switch (mode) {
+     case AcceleratorWriteDiscard:
+     case AcceleratorWrite:
+     case CpuWrite:
+       ViewLogger::LogClose(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
+       break;
+     }
+     v.ViewClose();
+   }
+ };
+ #define autoView(l_v,l,mode)						\
+   auto l_v = l.View(mode);						\
+   ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
+ #else
  // Little autoscope assister
  template<class View>
  class ViewCloser
@@ -119,6 +160,7 @@ class ViewCloser
  #define autoView(l_v,l,mode)						\
    auto l_v = l.View(mode);						\
    ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
+ #endif
  /////////////////////////////////////////////////////////////////////////////////////////
  // Lattice expression types used by ET to assemble the AST
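Usage is unchanged from the plain version; with GRID_LOG_VIEWS defined each open and close is additionally recorded together with the call site, which is what the view-logging data assurance relies on. A sketch (x, z and grid are hypothetical):

    LatticeComplex x(grid), z(grid);
    {
      autoView(x_v, x, AcceleratorRead);   // ViewLogger::LogOpen(__FILE__,__LINE__,...)
      autoView(z_v, z, AcceleratorWrite);
      accelerator_for(ss, grid->oSites(), vComplex::Nsimd(), {
          coalescedWrite(z_v[ss], x_v(ss));
      });
    } // ViewCloser destructors log the write buffers, then call ViewClose()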

View File

@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
  *
  */
- template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
+ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
                                                Lattice<vobj> &lat,
                                                int x,
                                                int dim,
@@ -82,10 +82,10 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
    int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
    int rNsimda= Nsimd/simd[dim]; // should be equal
-   assert(rNsimda==rNsimd);
+   GRID_ASSERT(rNsimda==rNsimd);
    int face_ovol=block*nblock;
-   // assert(buf.size()==face_ovol*rNsimd);
+   // GRID_ASSERT(buf.size()==face_ovol*rNsimd);
    /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
    //Let's make it work on GPU and then make a special accelerator_for that
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
    });
  }
- template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
+ template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
                                               const Lattice<vobj> &lat,
                                               int x,
                                               int dim,
@@ -172,7 +172,7 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
    int face_ovol=block*nblock;
-   // assert(buf.size()==face_ovol*rNsimd);
+   // GRID_ASSERT(buf.size()==face_ovol*rNsimd);
    /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
    //Let's make it work on GPU and then make a special accelerator_for that
@@ -247,7 +247,7 @@ public:
      Coordinate local =unpadded_grid->LocalDimensions();
      Coordinate procs =unpadded_grid->ProcessorGrid();
      for(int d=0;d<dims;d++){
-       if ( procs[d] > 1 ) assert(local[d]>=depth);
+       if ( procs[d] > 1 ) GRID_ASSERT(local[d]>=depth);
      }
    }
    void DeleteGrids(void)
@@ -448,9 +448,9 @@ public:
    int nld = to.Grid()->_ldimensions[dimension];
    const int Nsimd = vobj::Nsimd();
-   assert(depth<=lds[dimension]); // A must be on neighbouring node
-   assert(depth>0);               // A caller bug if zero
-   assert(ld+2*depth==nld);
+   GRID_ASSERT(depth<=lds[dimension]); // A must be on neighbouring node
+   GRID_ASSERT(depth>0);               // A caller bug if zero
+   GRID_ASSERT(ld+2*depth==nld);
    ////////////////////////////////////////////////////////////////////////////
    // Face size and byte calculations
    ////////////////////////////////////////////////////////////////////////////
@@ -460,15 +460,21 @@ public:
    }
    buffer_size = buffer_size / Nsimd;
    int rNsimd = Nsimd / simd[dimension];
-   assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
+   GRID_ASSERT( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
-   static cshiftVector<vobj> send_buf;
-   static cshiftVector<vobj> recv_buf;
+   static deviceVector<vobj> send_buf;
+   static deviceVector<vobj> recv_buf;
    send_buf.resize(buffer_size*2*depth);
    recv_buf.resize(buffer_size*2*depth);
+ #ifndef ACCELERATOR_AWARE_MPI
+   static hostVector<vobj> hsend_buf;
+   static hostVector<vobj> hrecv_buf;
+   hsend_buf.resize(buffer_size*2*depth);
+   hrecv_buf.resize(buffer_size*2*depth);
+ #endif
-   std::vector<CommsRequest_t> fwd_req;
-   std::vector<CommsRequest_t> bwd_req;
+   std::vector<MpiCommsRequest_t> fwd_req;
+   std::vector<MpiCommsRequest_t> bwd_req;
    int words = buffer_size;
    int bytes = words * sizeof(vobj);
@@ -495,9 +501,16 @@ public:
      t_gather+=usecond()-t;
      t=usecond();
+ #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(fwd_req,
                                (void *)&send_buf[d*buffer_size], xmit_to_rank,
                                (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+ #else
+     acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
+     grid->SendToRecvFromBegin(fwd_req,
+                               (void *)&hsend_buf[d*buffer_size], xmit_to_rank,
+                               (void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+ #endif
      t_comms+=usecond()-t;
    }
    for ( int d=0;d < depth ; d ++ ) {
@@ -508,9 +521,16 @@ public:
      t_gather+= usecond() - t;
      t=usecond();
+ #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFromBegin(bwd_req,
                                (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
                                (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+ #else
+     acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
+     grid->SendToRecvFromBegin(bwd_req,
+                               (void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
+                               (void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+ #endif
      t_comms+=usecond()-t;
    }
@@ -533,6 +553,11 @@ public:
    t=usecond();
    grid->CommsComplete(fwd_req);
+ #ifndef ACCELERATOR_AWARE_MPI
+   for ( int d=0;d < depth ; d ++ ) {
+     acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
+   }
+ #endif
    t_comms+= usecond() - t;
    t=usecond();
@@ -543,6 +568,11 @@ public:
    t=usecond();
    grid->CommsComplete(bwd_req);
+ #ifndef ACCELERATOR_AWARE_MPI
+   for ( int d=0;d < depth ; d ++ ) {
+     acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
+   }
+ #endif
    t_comms+= usecond() - t;
    t=usecond();
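The staging fallback in miniature (a sketch, not Grid's API): when the MPI library cannot address device memory, each device buffer is mirrored through a host copy around the transfer, exactly as the #ifndef ACCELERATOR_AWARE_MPI branches above do.

    #include <mpi.h>
    void staged_exchange(void *d_send, void *d_recv,   // device buffers
                         void *h_send, void *h_recv,   // host mirrors, same size
                         int bytes, int peer, MPI_Comm comm) {
      MPI_Request reqs[2];
    #ifndef ACCELERATOR_AWARE_MPI
      acceleratorCopyFromDevice(d_send, h_send, bytes);            // device -> host
      MPI_Irecv(h_recv, bytes, MPI_CHAR, peer, 0, comm, &reqs[0]);
      MPI_Isend(h_send, bytes, MPI_CHAR, peer, 0, comm, &reqs[1]);
      MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
      acceleratorCopyToDevice(h_recv, d_recv, bytes);              // host -> device
    #else
      MPI_Irecv(d_recv, bytes, MPI_CHAR, peer, 0, comm, &reqs[0]); // GPU-aware path
      MPI_Isend(d_send, bytes, MPI_CHAR, peer, 0, comm, &reqs[1]);
      MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
    #endif
    }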

View File

@@ -69,6 +69,7 @@ GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
 GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
 GridLogger GridLogDebug  (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
+GridLogger GridLogComms  (1, "Comms", GridLogColours, "BLUE");
 GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
 GridLogger GridLogIterative  (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
@@ -84,6 +85,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 GridLogDebug.Active(0);
 GridLogPerformance.Active(0);
 GridLogDslash.Active(0);
+GridLogComms.Active(0);
 GridLogIntegrator.Active(1);
 GridLogColours.Active(0);
 GridLogHMC.Active(1);
@@ -97,6 +99,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
 if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
 if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
+if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1);
 if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
 if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
 if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
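
With the stream registered in all three places (definition, default-off, name lookup), callers can switch it on by name. A hedged usage sketch, assuming the usual Grid pattern of streaming to std::cout through the logger object:

    // Activate the new Comms stream by name; the strings must match the
    // comparisons in GridLogConfigure above.
    std::vector<std::string> streams = {"Error","Warning","Message","Comms"};
    GridLogConfigure(streams);
    std::cout << GridLogComms << "halo exchange posted" << std::endl;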

View File

@@ -33,10 +33,6 @@
 #ifndef GRID_LOG_H
 #define GRID_LOG_H
-#ifdef HAVE_EXECINFO_H
-#include <execinfo.h>
-#endif
 NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -180,6 +176,7 @@ extern GridLogger GridLogError;
 extern GridLogger GridLogWarning;
 extern GridLogger GridLogMessage;
 extern GridLogger GridLogDebug;
+extern GridLogger GridLogComms;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogDslash;
 extern GridLogger GridLogIterative;
@@ -226,8 +223,6 @@ inline void Grid_pass(Args&&... args) {
   std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
 }
-#define _NBACKTRACE (256)
-extern void * Grid_backtrace_buffer[_NBACKTRACE];
 #define BACKTRACEFILE() { \
   char string[20]; \
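
The deleted lines remove the <execinfo.h> include and the shared 256-frame backtrace buffer from this header; only the BACKTRACEFILE macro remains in view. For orientation, a self-contained sketch of the execinfo idiom those declarations served (illustrative only, not Grid's code):

    #ifdef HAVE_EXECINFO_H
    #include <execinfo.h>
    #endif

    // Capture up to 256 frames and write symbolised lines to stderr (fd 2).
    static void print_backtrace(void) {
    #ifdef HAVE_EXECINFO_H
      void *frames[256];
      int n = backtrace(frames, 256);
      backtrace_symbols_fd(frames, n, 2);
    #endif
    }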

View File

@@ -293,9 +293,9 @@ class BinaryIO {
   // Flatten the file
   uint64_t lsites = grid->lSites();
   if ( control & BINARYIO_MASTER_APPEND )  {
-    assert(iodata.size()==1);
+    GRID_ASSERT(iodata.size()==1);
   } else  {
-    assert(lsites==iodata.size());
+    GRID_ASSERT(lsites==iodata.size());
   }
   for(int d=0;d<ndim;d++){
     gStart[d] = lLattice[d]*pcoor[d];
@@ -326,20 +326,20 @@ class BinaryIO {
   // Sobj in MPI phrasing
   //////////////////////////////////////////////////////////////////////////////
   int ierr;
-  ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject); assert(ierr==0);
+  ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject); GRID_ASSERT(ierr==0);
   ierr = MPI_Type_commit(&mpiObject);
   //////////////////////////////////////////////////////////////////////////////
   // File global array data type
   //////////////////////////////////////////////////////////////////////////////
-  ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray); assert(ierr==0);
-  ierr=MPI_Type_commit(&fileArray); assert(ierr==0);
+  ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray); GRID_ASSERT(ierr==0);
+  ierr=MPI_Type_commit(&fileArray); GRID_ASSERT(ierr==0);
   //////////////////////////////////////////////////////////////////////////////
   // local lattice array
   //////////////////////////////////////////////////////////////////////////////
-  ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); assert(ierr==0);
-  ierr=MPI_Type_commit(&localArray); assert(ierr==0);
+  ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); GRID_ASSERT(ierr==0);
+  ierr=MPI_Type_commit(&localArray); GRID_ASSERT(ierr==0);
 #endif
   //////////////////////////////////////////////////////////////////////////////
@@ -349,8 +349,8 @@ class BinaryIO {
   int ieee32    = (format == std::string("IEEE32"));
   int ieee64big = (format == std::string("IEEE64BIG"));
   int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
-  assert(ieee64||ieee32|ieee64big||ieee32big);
-  assert((ieee64+ieee32+ieee64big+ieee32big)==1);
+  GRID_ASSERT(ieee64||ieee32|ieee64big||ieee32big);
+  GRID_ASSERT((ieee64+ieee32+ieee64big+ieee32big)==1);
   //////////////////////////////////////////////////////////////////////////////
   // Do the I/O
   //////////////////////////////////////////////////////////////////////////////
@@ -361,9 +361,9 @@ class BinaryIO {
   if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
   std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
-  ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
-  ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
-  ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
+  ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); GRID_ASSERT(ierr==0);
+  ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); GRID_ASSERT(ierr==0);
+  ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); GRID_ASSERT(ierr==0);
   MPI_File_close(&fh);
   MPI_Type_free(&fileArray);
   MPI_Type_free(&localArray);
@@ -384,13 +384,14 @@ class BinaryIO {
     fin.seekg(offset + myrank * lsites * sizeof(fobj));
   }
   fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
-  assert(fin.fail() == 0);
+  GRID_ASSERT(fin.fail() == 0);
   fin.close();
 }
-timer.Stop();
 grid->Barrier();
+timer.Stop();
 bstimer.Start();
 ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
 if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
@@ -435,11 +436,11 @@ class BinaryIO {
 std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
 ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
-assert(ierr == 0);
+GRID_ASSERT(ierr == 0);
 std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
 ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
-assert(ierr == 0);
+GRID_ASSERT(ierr == 0);
 MPI_Offset os;
 MPI_File_get_position(fh, &os);
@@ -506,6 +507,7 @@ class BinaryIO {
   offset = fout.tellp();
   fout.close();
 }
+grid->Barrier();
 timer.Stop();
 }
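
Two things change in this file: every bare assert(...) becomes GRID_ASSERT(...), and timer.Stop() now runs after grid->Barrier() on the read path (with a matching Barrier added before timer.Stop() on the write path), so the recorded I/O time includes cross-rank synchronisation rather than just the local transfer. A plausible shape for such an assertion macro — an illustrative sketch, not Grid's actual definition — keeps the check but reports file and line before aborting:

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical stand-in for GRID_ASSERT; the real macro may also emit
    // a backtrace or rank information.
    #define SKETCH_GRID_ASSERT(cond)                                 \
      do {                                                           \
        if (!(cond)) {                                               \
          std::fprintf(stderr, "Assertion '%s' failed at %s:%d\n",   \
                       #cond, __FILE__, __LINE__);                   \
          std::abort();                                              \
        }                                                            \
      } while (0)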

View File

@@ -289,7 +289,7 @@ class GridLimeReader : public BinaryIO {
       return;
     }
   }
-  assert(0);
+  GRID_ASSERT(0);
 }
 ////////////////////////////////////////////
 // Read a generic serialisable object
@@ -314,7 +314,7 @@ class GridLimeReader : public BinaryIO {
     }
   }
-  assert(0);
+  GRID_ASSERT(0);
 }
 template<class serialisable_object>
@@ -348,7 +348,7 @@ class GridLimeWriter : public BinaryIO
   filename= _filename;
   if ( boss_node ) {
     File = fopen(filename.c_str(), "w");
-    LimeW = limeCreateWriter(File); assert(LimeW != NULL );
+    LimeW = limeCreateWriter(File); GRID_ASSERT(LimeW != NULL );
   }
 }
 /////////////////////////////////////////////
@@ -368,7 +368,7 @@ class GridLimeWriter : public BinaryIO
   if ( boss_node ) {
     LimeRecordHeader *h;
     h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
-    assert(limeWriteRecordHeader(h, LimeW) >= 0);
+    GRID_ASSERT(limeWriteRecordHeader(h, LimeW) >= 0);
     limeDestroyHeader(h);
   }
   return LIME_SUCCESS;
@@ -386,11 +386,11 @@ class GridLimeWriter : public BinaryIO
   // std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
   int err;
   LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
-  assert(h!= NULL);
-  err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
-  err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
-  err=limeWriterCloseRecord(LimeW); assert(err>=0);
+  GRID_ASSERT(h!= NULL);
+  err=limeWriteRecordHeader(h, LimeW); GRID_ASSERT(err>=0);
+  err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); GRID_ASSERT(err>=0);
+  err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
   limeDestroyHeader(h);
 }
 }
@@ -431,7 +431,7 @@ class GridLimeWriter : public BinaryIO
 ////////////////////////////////////////////////////////////////////
 GridBase *grid = field.Grid();
-assert(boss_node == field.Grid()->IsBoss() );
+GRID_ASSERT(boss_node == field.Grid()->IsBoss() );
 FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
@@ -473,7 +473,7 @@ class GridLimeWriter : public BinaryIO
 if ( boss_node ) {
   fseek(File,0,SEEK_END);
   uint64_t offset2 = ftello(File); // std::cout << " now at offset "<<offset2 << std::endl;
-  assert( (offset2-offset1) == PayloadSize);
+  GRID_ASSERT( (offset2-offset1) == PayloadSize);
 }
 /////////////////////////////////////////////////////////////
@@ -481,7 +481,7 @@ class GridLimeWriter : public BinaryIO
 /////////////////////////////////////////////////////////////
 if ( boss_node ) {
-  err=limeWriterCloseRecord(LimeW); assert(err>=0);
+  err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
 }
 ////////////////////////////////////////
 // Write checksum element, propagaing forward from the BinaryIO
@@ -621,8 +621,8 @@ class IldgWriter : public ScidacWriter {
 uint64_t PayloadSize = LFN.size();
 int err;
 createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
-err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
-err=limeWriterCloseRecord(LimeW); assert(err>=0);
+err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); GRID_ASSERT(err>=0);
+err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
 }
 ////////////////////////////////////////////////////////////////
@@ -656,7 +656,7 @@ class IldgWriter : public ScidacWriter {
 header.sequence_number = sequence;
 header.ildg_lfn = LFN;
-assert ( (format == std::string("IEEE32BIG"))
-        ||(format == std::string("IEEE64BIG")) );
+GRID_ASSERT ( (format == std::string("IEEE32BIG"))
+        ||(format == std::string("IEEE64BIG")) );
 //////////////////////////////////////////////////////
@@ -676,8 +676,8 @@ class IldgWriter : public ScidacWriter {
 ildgfmt.ly = header.dimension[1];
 ildgfmt.lz = header.dimension[2];
 ildgfmt.lt = header.dimension[3];
-assert(header.nd==4);
-assert(header.nd==header.dimension.size());
+GRID_ASSERT(header.nd==4);
+GRID_ASSERT(header.nd==header.dimension.size());
 //////////////////////////////////////////////////////////////////////////////
 // Field norm tests
@@ -734,7 +734,7 @@ class IldgReader : public GridLimeReader {
 Coordinate dims = Umu.Grid()->FullDimensions();
-assert(dims.size()==4);
+GRID_ASSERT(dims.size()==4);
 // Metadata holders
 ildgFormat     ildgFormat_ ;
@@ -793,10 +793,10 @@ class IldgReader : public GridLimeReader {
 if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
 if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
-assert( ildgFormat_.lx == dims[0]);
-assert( ildgFormat_.ly == dims[1]);
-assert( ildgFormat_.lz == dims[2]);
-assert( ildgFormat_.lt == dims[3]);
+GRID_ASSERT( ildgFormat_.lx == dims[0]);
+GRID_ASSERT( ildgFormat_.ly == dims[1]);
+GRID_ASSERT( ildgFormat_.lz == dims[2]);
+GRID_ASSERT( ildgFormat_.lt == dims[3]);
 found_ildgFormat = 1;
 }
@@ -813,10 +813,10 @@ class IldgReader : public GridLimeReader {
 format = FieldMetaData_.floating_point;
-assert(FieldMetaData_.dimension[0] == dims[0]);
-assert(FieldMetaData_.dimension[1] == dims[1]);
-assert(FieldMetaData_.dimension[2] == dims[2]);
-assert(FieldMetaData_.dimension[3] == dims[3]);
+GRID_ASSERT(FieldMetaData_.dimension[0] == dims[0]);
+GRID_ASSERT(FieldMetaData_.dimension[1] == dims[1]);
+GRID_ASSERT(FieldMetaData_.dimension[2] == dims[2]);
+GRID_ASSERT(FieldMetaData_.dimension[3] == dims[3]);
 found_FieldMetaData = 1;
 }
@@ -866,13 +866,13 @@ class IldgReader : public GridLimeReader {
 // Minimally must find binary segment and checksum
 // Since this is an ILDG reader require ILDG format
 //////////////////////////////////////////////////////
-assert(found_ildgLFN);
-assert(found_ildgBinary);
-assert(found_ildgFormat);
-assert(found_scidacChecksum);
+GRID_ASSERT(found_ildgLFN);
+GRID_ASSERT(found_ildgBinary);
+GRID_ASSERT(found_ildgFormat);
+GRID_ASSERT(found_scidacChecksum);
 // Must find something with the lattice dimensions
-assert(found_FieldMetaData||found_ildgFormat);
+GRID_ASSERT(found_FieldMetaData||found_ildgFormat);
 if ( found_FieldMetaData ) {
@@ -880,9 +880,9 @@ class IldgReader : public GridLimeReader {
 } else {
-  assert(found_ildgFormat);
+  GRID_ASSERT(found_ildgFormat);
   const std::string stNC = std::to_string( Nc ) ;
-  assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
+  GRID_ASSERT ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
 ///////////////////////////////////////////////////////////////////////////////////////
 // Populate our Grid metadata as best we can
@@ -927,20 +927,20 @@ class IldgReader : public GridLimeReader {
 FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
 FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
 scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
-assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
-assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
+GRID_ASSERT( scidac_csuma ==FieldMetaData_.scidac_checksuma);
+GRID_ASSERT( scidac_csumb ==FieldMetaData_.scidac_checksumb);
 std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
 } else {
 std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
-assert(0); // Can I insist always checksum ?
+GRID_ASSERT(0); // Can I insist always checksum ?
 }
 if ( found_FieldMetaData || found_usqcdInfo ) {
 FieldMetaData checker;
 stats Stats;
 Stats(Umu,checker);
-assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
-assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
+GRID_ASSERT(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
+GRID_ASSERT(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
 std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
 }
 }
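
The reader refuses configurations without a SciDAC checksum (GRID_ASSERT(0) in the else branch) and, when metadata is present, recomputes plaquette and link trace and demands agreement to 1.0e-5. A schematic of that acceptance logic, with hypothetical names and the tolerances taken from the hunks above:

    #include <cmath>
    #include <cstdint>

    // Illustrative acceptance test for a read gauge field; not Grid's code.
    bool accept_gauge_field(double plaq_header, double plaq_computed,
                            uint32_t csuma_header, uint32_t csuma_computed,
                            uint32_t csumb_header, uint32_t csumb_computed) {
      if (csuma_header != csuma_computed) return false; // checksum must match
      if (csumb_header != csumb_computed) return false;
      return std::fabs(plaq_computed - plaq_header) < 1.0e-5; // physics sanity check
    }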

View File

@@ -203,7 +203,7 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
 //////////////////////////////////////////////////////////////////////
 inline void reconstruct3(LorentzColourMatrix & cm)
 {
-  assert( Nc < 4 && Nc > 1 ) ;
+  GRID_ASSERT( Nc < 4 && Nc > 1 ) ;
   for(int mu=0;mu<Nd;mu++){
 #if Nc == 2
   cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
@@ -240,7 +240,7 @@ struct BinarySimpleUnmunger {
   sobj_stype *in_buffer = (sobj_stype *)&in;
   size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
   size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
-  assert(fobj_words == sobj_words);
+  GRID_ASSERT(fobj_words == sobj_words);
   for (unsigned int word = 0; word < sobj_words; word++)
     out_buffer[word] = in_buffer[word]; // type conversion on the fly
@@ -259,7 +259,7 @@ struct BinarySimpleMunger {
   sobj_stype *out_buffer = (sobj_stype *)&out;
   size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
   size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
-  assert(fobj_words == sobj_words);
+  GRID_ASSERT(fobj_words == sobj_words);
   for (unsigned int word = 0; word < sobj_words; word++)
     out_buffer[word] = in_buffer[word]; // type conversion on the fly
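
The simple munger and unmunger convert between file-precision and site-precision objects one scalar word at a time, after asserting both objects contain the same number of words. A stand-alone sketch of that conversion (plain template types, not Grid's traits):

    #include <cassert>
    #include <cstddef>

    // Word-by-word precision conversion, as in BinarySimpleMunger above.
    template <class Fword, class Sword>
    void munge_words(const Fword *in, Sword *out,
                     size_t in_bytes, size_t out_bytes) {
      size_t fwords = in_bytes / sizeof(Fword);
      size_t swords = out_bytes / sizeof(Sword);
      assert(fwords == swords);               // layouts must agree word-for-word
      for (size_t w = 0; w < fwords; w++)
        out[w] = static_cast<Sword>(in[w]);   // type conversion on the fly
    }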

View File

@@ -76,7 +76,7 @@ public:
   removeWhitespace(line);
   std::cout << GridLogMessage << "* " << line << std::endl;
-  assert(line==std::string("BEGIN_HEADER"));
+  GRID_ASSERT(line==std::string("BEGIN_HEADER"));
   do {
     getline(fin,line); // read one line
@@ -106,9 +106,9 @@ public:
   field.dimension[2] = std::stol(header["DIMENSION_3"]);
   field.dimension[3] = std::stol(header["DIMENSION_4"]);
-  assert(grid->_ndimension == 4);
+  GRID_ASSERT(grid->_ndimension == 4);
   for(int d=0;d<4;d++){
-    assert(grid->_fdimensions[d]==field.dimension[d]);
+    GRID_ASSERT(grid->_fdimensions[d]==field.dimension[d]);
   }
   field.link_trace = std::stod(header["LINK_TRACE"]);
@@ -183,7 +183,7 @@ public:
     nersc_csum,scidac_csuma,scidac_csumb);
   }
 } else {
-  assert(0);
+  GRID_ASSERT(0);
 }
 GaugeStats Stats; Stats(Umu,clone);
@@ -205,9 +205,9 @@ public:
   std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
   exit(0);
 }
-if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
-assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
-assert(nersc_csum == header.checksum );
+if(exitOnReadPlaquetteMismatch()) GRID_ASSERT(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+GRID_ASSERT(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
+GRID_ASSERT(nersc_csum == header.checksum );
 std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
 }
@@ -246,7 +246,7 @@ public:
 GridBase *grid = Umu.Grid();
 GridMetaData(grid,header);
-assert(header.nd==4);
+GRID_ASSERT(header.nd==4);
 GaugeStats Stats; Stats(Umu,header);
 MachineCharacteristics(header);
@@ -302,7 +302,7 @@ public:
 GridBase *grid = parallel.Grid();
 GridMetaData(grid,header);
-assert(header.nd==4);
+GRID_ASSERT(header.nd==4);
 header.link_trace=0.0;
 header.plaquette=0.0;
 MachineCharacteristics(header);
@@ -355,16 +355,16 @@ public:
 std::string data_type(header.data_type);
 #ifdef RNG_RANLUX
-assert(format == std::string("UINT64"));
-assert(data_type == std::string("RANLUX48"));
+GRID_ASSERT(format == std::string("UINT64"));
+GRID_ASSERT(data_type == std::string("RANLUX48"));
 #endif
 #ifdef RNG_MT19937
-assert(format == std::string("UINT32"));
-assert(data_type == std::string("MT19937"));
+GRID_ASSERT(format == std::string("UINT32"));
+GRID_ASSERT(data_type == std::string("MT19937"));
 #endif
 #ifdef RNG_SITMO
-assert(format == std::string("UINT64"));
-assert(data_type == std::string("SITMO"));
+GRID_ASSERT(format == std::string("UINT64"));
+GRID_ASSERT(data_type == std::string("SITMO"));
 #endif
 // depending on datatype, set up munger;
@@ -376,7 +376,7 @@ public:
 std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
 exit(0);
 }
-assert(nersc_csum == header.checksum );
+GRID_ASSERT(nersc_csum == header.checksum );
 std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
 }
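
The RNG reader ties the accepted header FORMAT and DATA_TYPE strings to the engine Grid was compiled with, then re-checks the NERSC checksum. A condensed sketch of that selection logic (same preprocessor guards as above, simplified interface):

    #include <cassert>
    #include <string>

    // One engine is compiled in; its header strings are the only valid ones.
    void check_rng_header(const std::string &format, const std::string &data_type) {
    #if   defined(RNG_RANLUX)
      assert(format == "UINT64" && data_type == "RANLUX48");
    #elif defined(RNG_MT19937)
      assert(format == "UINT32" && data_type == "MT19937");
    #elif defined(RNG_SITMO)
      assert(format == "UINT64" && data_type == "SITMO");
    #endif
    }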

Some files were not shown because too many files have changed in this diff.