1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-04-29 15:06:00 +01:00

Compare commits

..

347 Commits

Author SHA1 Message Date
Chulwoo Jung 09aa843984 Changed batchedInnerProduct for portability 2026-03-17 18:54:18 -04:00
Chulwoo Jung 24752002fa Verbosity reduction batched inner product for reorthogonalization 2026-03-17 13:02:16 -04:00
Chulwoo Jung f3223021fd RestartedLanczosBidiagonalization seems to have been fixed 2026-03-16 14:34:56 -04:00
Chulwoo Jung 4e1d95d3bb Claude implementation of Thick Restarted Lanczos Bidiagonalization 2026-03-13 19:12:54 -04:00
Chulwoo Jung 2ed38f63ca Merge branch 'develop' of https://github.com/paboyle/Grid into KS_shifted 2026-03-12 10:49:21 -04:00
Chulwoo Jung 80d2a8d88d Merge branch 'develop' of https://github.com/paboyle/Grid into KS_shifted 2026-03-11 21:49:26 -04:00
Peter Boyle 595ceaac37 Include grid header and make the ENABLE correct 2026-03-11 17:24:44 -04:00
Peter Boyle daf5834e8e Fixing incorrect PR about disable fermion instantiations 2026-03-11 17:05:46 -04:00
Chulwoo Jung 2ac5431401 Turning off NERSC header checking 2026-03-06 14:16:20 -05:00
Peter Boyle 0d8658a039 Optimised 2026-03-05 06:06:32 -05:00
Peter Boyle 095e004d01 Setup change GCR 2026-03-05 06:06:32 -05:00
Peter Boyle 0acabee7f6 Modest change 2026-03-05 06:06:32 -05:00
Peter Boyle 76fbcffb60 Improvement to 16^3 hdcg 2026-03-05 06:06:32 -05:00
Peter Boyle a0a62d7ead Merge pull request #478 from vataspro/PolyakovUpstream
Spatial Polyakov Loop implementation
2026-02-24 20:45:42 -05:00
Peter Boyle c5038ea6a5 Merge pull request #483 from cmcknigh/bugfix/rocm7-rocblas-type-refactor
Adding a version check to handle rocBlas type refactor
2026-02-24 20:45:03 -05:00
Peter Boyle a5120903eb Merge pull request #486 from RChrHill/fix/sp4-fp32
Define Sp4 ProjectOnGeneralGroup for generic vtype
2026-02-24 20:44:08 -05:00
Peter Boyle 00b286a08a Merge pull request #488 from RChrHill/feature/additional-ET-traces
Add ET support for Lattice spin- and colour-traces
2026-02-24 20:43:45 -05:00
Peter Boyle 24a9759353 Merge pull request #485 from edbennett/skip-fermion-instantiations
Be able to skip compiling fermion instantiations altogether
2026-02-24 20:43:20 -05:00
edbennett 1b56f6f46d be able to skip compiling fermion instantiations altogether 2026-02-24 23:52:18 +00:00
Peter Boyle 2a8084d569 Subspace setup 2026-02-13 17:26:11 -05:00
Peter Boyle 6ff29f9d4f Alternate multigrids 2026-02-13 17:25:45 -05:00
RChHill c4d3e79193 Add ET support for Lattice spin- and colour-traces 2026-01-29 14:46:52 +00:00
Chulwoo Jung 3e71cac7ae Merge branch 'specflow2' of github.com:chulwoo1/Grid into KS_shifted 2026-01-15 03:40:52 +00:00
Chulwoo Jung e8e7ef08fc KrylovSchur and spectral flow updates 2026-01-15 03:20:01 +00:00
Chulwoo Jung 5c00fe6bef Merge branch 'develop' of github.com:poare/Grid into KS_shifted 2026-01-12 06:26:15 +00:00
Chulwoo Jung 3175788f97 Added explicit shift before pulling 2026-01-12 06:25:09 +00:00
Patrick Oare 6f1788bb38 modified Givens rotation to implement a sparse multiplication 2026-01-06 16:19:48 -05:00
Peter Boyle 7cd3f21e6b preserving a bunch of experiments on setup and g5 subspace doubling 2026-01-06 05:57:39 -05:00
Chulwoo Jung dcda74f924 Timing info for schurReorder,etc 2025-12-18 18:23:50 +00:00
Chulwoo Jung df4c2a082b Reducing comments 2025-12-09 14:23:22 +00:00
Chulwoo Jung 88611659a3 Appear to be working 2025-12-08 21:08:14 -05:00
Chulwoo Jung 504b85dfc0 Restarting and adding codes back in 2025-12-08 13:27:06 -05:00
Chulwoo Jung 43ea83e5e1 Checking in to move back to genoa 2025-12-05 23:56:40 +00:00
Chulwoo Jung 376150c3df Adding 2025-12-04 21:29:31 -05:00
Chulwoo Jung 842e0391e7 Checking in to move back to aurora 2025-12-04 20:13:44 -05:00
Chulwoo Jung 17e3799bcc Necessary code for Harmonic KS added 2025-12-03 19:38:45 -05:00
Chulwoo Jung 985ab70f85 Checking in without adjusting Nk 2025-12-03 14:46:34 -05:00
Chulwoo Jung 1e85081986 Adding shift and debugging 2025-12-03 00:16:51 -05:00
Chulwoo Jung 3876fe5a29 Merge branch 'KrylovSchur' of github.com:chulwoo1/Grid into KS_shifted 2025-12-02 17:46:40 -05:00
Chulwoo Jung 6692425aa2 Checking in before pulling 2025-11-26 17:17:22 -05:00
Chulwoo Jung d5ac4fc67f Starting to modified KS 2025-11-26 22:13:27 +00:00
Chulwoo Jung 3538faf449 Starting Harmonic (shift and inverse) 2025-11-24 17:05:35 -05:00
paboyle 4a0aaf0786 Fix issue with Aurora compilers 2025-11-21 21:41:13 +00:00
paboyle 9c3835524c Fix compile warn 2025-11-21 21:41:12 +00:00
paboyle 549351bb8a Stag verbose clean up 2025-11-20 18:22:57 +00:00
RChHill b650b89682 Define Sp4 ProjectOnGeneralGroup for generic vtype 2025-11-19 13:26:52 +00:00
Peter Boyle 74e6b19f83 Looks like the reuse of xfers in staggered has bugs or corner cases depending on volume 2025-11-17 22:29:06 -05:00
Peter Boyle 2e684028de Improvements 2025-11-14 18:12:27 -05:00
Patrick Oare 0b457b9d52 fixed ritz estimate bug 2025-11-07 18:56:08 +00:00
Chulwoo Jung fe0ab5f1a9 Merge branch 'develop' of github.com:poare/Grid into develop 2025-11-07 15:50:22 +00:00
Chulwoo Jung caa66418bd Checking in before pulling 2025-11-06 22:44:05 +00:00
paboyle c54d87a472 Aurora compile fix for new compiler 2025-11-06 18:17:33 +00:00
Allen McKnight 4304245c1b Merge branch 'develop' into bugfix/rocm7-rocblas-type-refactor 2025-11-04 08:50:11 -06:00
Chulwoo Jung 786496f22e Checking in before pulling KrylovSchur 2025-11-03 21:18:56 +00:00
Patrick R Oare 68af1bba67 commented some slow code out 2025-10-31 11:47:29 -04:00
Patrick Oare bf2a715ef7 bug in wilson eigenvectors: ritz estimates not equalling deviation from being an evec 2025-10-31 15:31:46 +00:00
Patrick Oare 4042ebf1bf added ImNorm to sort 2025-10-20 19:01:53 +00:00
Peter Boyle 6165931afa Update GridStd.h 2025-10-03 14:35:37 -04:00
Your Name 1d1fd3bcaf adding a version check to handle rocblas type change 2025-10-02 15:24:24 -05:00
Patrick R Oare 82f35001ff small bug fix for wilson spectrum since we're actually running DWF 2025-09-25 15:36:42 -04:00
Patrick Oare fa30c791aa updated wilson spec 2025-09-23 15:24:50 +00:00
Patrick Oare 612049f1dd commented out evec writer because it was taking up all the space on SDCC 2025-09-18 15:09:31 -04:00
Patrick Oare 0b92ef990c found bug in unprec DWF: was using |\cdot| in comparison for the eigenvalue sorting 2025-09-12 13:31:39 -04:00
Patrick Oare 82d411ca7b added inline to rf functions 2025-09-10 17:16:48 -04:00
Patrick Oare 597086a031 added wilson spectrum example 2025-09-10 15:41:00 -04:00
Patrick Oare b210ddf9a7 added commented out line to run un-preconditioned DWF 2025-09-09 15:14:11 -04:00
Patrick Oare c5d02e5799 updated RitzFilter enum and the input to run krylov schur 2025-09-09 13:02:11 -04:00
Patrick Oare 9dcd7ca761 added IO for evecs / evals 2025-09-08 12:59:48 -04:00
paboyle 23581333e6 link cufft 2025-08-21 22:25:55 +01:00
paboyle e5fa3d887f Compile on CUDA 2025-08-21 22:10:27 +01:00
paboyle 583fa7bb0a FFTW guarded after CUDA and HIP 2025-08-21 22:00:12 +01:00
Peter Boyle fe0db53842 FFT offload to GPU and MUCH faster comms.
40x speed up on Frontier
2025-08-21 16:45:38 -04:00
Peter Boyle 76c0ada1e1 Benchmark for En Hung 2025-08-21 16:45:38 -04:00
Peter Boyle 92f49e9194 Merge pull request #482 from g-simonetti/wflow_sp2n_paboyle
Fixed Wilson flow for Nc not equal to 3
2025-08-21 09:10:25 -04:00
Peter Boyle 44c8057b5f Merge pull request #481 from vataspro/sp-reps-fix
Only compile higher fermion representations for symplectic gauge group when requested via configure flag
2025-08-20 12:57:28 -04:00
Alexis Provatas 0ad837f595 Fix Sp representations compilation 2025-08-20 17:48:39 +01:00
Peter Boyle bd2103c746 Merge pull request #480 from vataspro/fix-no-comms
Fix enable-comms=none
2025-08-20 12:26:47 -04:00
Alexis Provatas 9c18d2ddb0 Fix StencilSendToRecvFromBegin to agree with base 2025-08-20 17:17:06 +01:00
g-simonetti 1245a8c151 num_colours added to class S 2025-08-20 16:27:34 +01:00
g-simonetti 07113dc8ba Changed beta=3 to beta=Nc with comments 2025-08-20 16:18:34 +01:00
Chulwoo Jung c1e5ef9476 Adding config input 2025-08-15 20:52:36 +00:00
Patrick Oare 6fd71aea9d may have found bug 2025-08-15 12:13:01 -04:00
Patrick Oare a18b0d496c added more debug output 2025-08-15 11:51:15 -04:00
paboyle a3420e6fa9 Update for grid view logging 2025-08-14 21:29:20 +00:00
paboyle 732836d9f8 Missed one 2025-08-14 20:25:54 +00:00
paboyle 87658f7b53 ASSERT tripped in Shuhei's branch 2025-08-14 20:08:54 +00:00
Patrick Oare 19f0737b98 trying one more thing 2025-08-14 14:47:38 -04:00
Patrick Oare 16d3c9cf75 added another debug feature 2025-08-14 14:37:49 -04:00
Patrick Oare 3b9fc72451 modified debug output slightly 2025-08-14 14:06:32 -04:00
Patrick Oare 99644f5d0a commented out arg assert in kryschur spec example 2025-08-13 14:57:57 -04:00
paboyle e7f51e5fb1 Timer pointers for hadrons compat.
Reluctantly, this interface is silly to pass timers around.
2025-08-11 21:11:36 +01:00
Peter Boyle 1ce5f70dd1 Update GridStd.h 2025-08-11 12:20:54 -04:00
Peter Boyle 473635f401 Update BinaryIO.h 2025-08-11 11:06:06 -04:00
paboyle 5adf2657dd Updated to compile and run fast on CUDA 2025-08-10 00:00:13 +01:00
paboyle 82cfff2990 A2A meson field BLAS based momentum project 2025-08-07 15:51:15 +00:00
paboyle 4397b1c442 Debugged momentum projection for A2A Meson Field 2025-08-07 15:51:01 +00:00
paboyle 9e6a4a4737 Assertion updates to macros (mostly) with backtrace.
Wilson flow to include options for DBW2, Iwasaki, Symanzik.
View logging for data assurance
2025-08-07 15:48:38 +00:00
Chulwoo Jung 7780d88d26 Adding simple lanczos, boundary to specflow(!) 2025-08-06 23:41:53 +00:00
Chulwoo Jung 2bf9179d2c Adding mass step 2025-08-06 16:52:51 +00:00
Chulwoo Jung c606f5dca0 Move out src initialization for re-use / Adding antiperiodic BC 2025-08-06 16:51:14 +00:00
Patrick Oare 632f5916c7 small log change for KS 2025-08-04 15:43:23 -04:00
Patrick Oare 9057694895 added double orthog to KS 2025-08-04 15:30:18 -04:00
Patrick Oare 5e85aef19d added updates to GCR polynomial code 2025-07-31 16:42:35 -04:00
Patrick Oare 2b6d40c7e1 added example files 2025-07-31 16:41:28 -04:00
Patrick Oare 33b80c4e8e added eigensolver code for arnoldi and krylov schur 2025-07-31 16:40:24 -04:00
paboyle 41f344bbd3 Merge with Christoph GPT checksum debug 2025-07-15 03:06:09 +00:00
Chulwoo Jung 8419cc5c64 specflow evec I/O added, 2025-07-11 15:57:23 -04:00
paboyle a77cd50b2f Update comms logging in Cshift 2025-07-11 14:36:10 +00:00
paboyle 73af020f98 improved 2025-06-27 06:08:54 +00:00
paboyle bffb83c46e std::cout<<GridLogMessage<<"Debug:"<<std::endl;
std::cout<<GridLogMessage<<"  --dylib-map     : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
    std::cout<<GridLogMessage<<"  --heartbeat     : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
    std::cout<<GridLogMessage<<"  --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-heartbeat : periodically report backtrace "<<std::endl;

--dylib-map : Grid prints its dylib regions
--heartbeat : itimer based / SIGALRM wake up which seems to make Aurora
more stable
--debug-heartbeat : periodically report to stderr where we are in code

Now have libunwind option (configure: --with-unwind=<prefix>) to give an
Asynch-Signal safe backtrace. Avoid glibc backtrace due to mallocs.
2025-06-27 06:08:54 +00:00
paboyle 7031f37350 Use libunwind for backtrace as it is signal asynch safe 2025-06-27 06:08:54 +00:00
paboyle 829dd74cb2 Verbose change 2025-06-27 06:08:54 +00:00
paboyle 66e671985d P2P 2025-06-27 06:08:54 +00:00
paboyle 5afcbcf0f3 Cshift uses flight recorder 2025-06-27 06:08:54 +00:00
paboyle 9730579312 Simplify and verbose 2025-06-27 06:08:51 +00:00
paboyle bfae14d035 More flight logging 2025-06-27 06:07:34 +00:00
paboyle b78fc73d19 Better signal handler 2025-06-27 06:07:34 +00:00
Peter Boyle 709f8ae76c Update README 2025-06-26 23:06:11 -04:00
Peter Boyle 7aa06329d0 Update for new stencil compression options 2025-06-17 18:06:19 +02:00
Peter Boyle 9d6a38c44c Compressed comms options as Sloppy 2025-06-17 16:43:53 +02:00
Peter Boyle 6ec5cee368 Preparing for compressed comms 2025-06-17 16:38:10 +02:00
Peter Boyle f2e9a68825 Simplify 2025-06-13 17:32:05 +02:00
Peter Boyle d88750e6b6 Sloppy + non-sloppy 2025-06-13 16:42:01 +02:00
Peter Boyle 821358eda7 Remove partial dirichlet. Favour intro reduced prec comms options 2025-06-13 05:08:45 +02:00
Peter Boyle fce6e1f135 Kill core files for quota reasons 2025-06-13 05:08:15 +02:00
Peter Boyle 8f0bb3e676 remove partial dirichlet 2025-06-13 05:07:56 +02:00
Peter Boyle 262c70d967 USe sloppy comms options 2025-06-13 05:07:23 +02:00
Peter Boyle da43ef7c2d REmove partial dirichlet option. It's going nowhere 2025-06-13 05:05:15 +02:00
Peter Boyle 7b60ab5df1 Warning suppress 2025-06-13 05:04:55 +02:00
Peter Boyle f6b961a64e Warning suppress 2025-06-13 05:04:47 +02:00
Peter Boyle f1ed988aa3 Interface to reduced precision comms 2025-06-13 05:04:12 +02:00
Peter Boyle eea51bb604 Suppress annoying warns 2025-06-13 05:03:36 +02:00
Peter Boyle 9203126aa5 Scripts 2025-06-11 15:30:16 +02:00
Peter Boyle f90ba4712a Update for Jupiter 2025-06-11 15:24:34 +02:00
Peter Boyle 3737a24096 Updated python output 2025-06-03 14:09:29 -04:00
paboyle d418f78352 Making running on Aurora more debuggable 2025-05-23 20:58:16 +00:00
paboyle 25163998a0 Makes SYCL compiler happy 2025-05-23 20:57:11 +00:00
Peter Boyle dc546aaa4b Updated config options for BNL cluster 2025-05-13 18:44:47 -04:00
Peter Boyle 5364d580c9 Output chirality, eigenvector density files and python source lego plot 2025-05-13 18:44:47 -04:00
Peter Boyle 2a9a6347e3 Do not require Grid format RNGs and also to the 5Li reporting 2025-05-13 18:44:47 -04:00
Peter Boyle cfdb56f314 Run measurements at t=0 too 2025-05-13 18:44:46 -04:00
Peter Boyle b517e88db3 Update README 2025-05-13 16:49:21 -04:00
paboyle bb317aba8d Lattice = for sycl 2025-05-13 12:50:58 +00:00
paboyle 644cc6647e JSON update 2025-05-13 12:50:58 +00:00
paboyle 72397ce23b SYCL interface change 2025-05-13 12:50:58 +00:00
Alexis Provatas c646d91527 Fix names, protect against bad index values, clean docstrings 2025-05-01 10:52:00 +01:00
Alexis Provatas a2b98d82e1 remove obsolete spatial polyakov observable file 2025-05-01 10:52:00 +01:00
Alexis Provatas 7b9415c088 Move observable logger to Polyakov Loop file and fix docstring 2025-05-01 10:52:00 +01:00
Alexis Provatas cb7110f492 Add Spatial Polyakov Loop observable 2025-05-01 10:52:00 +01:00
Alexis Provatas 0c7af66490 Create Spatial Polyakov Observable Module 2025-05-01 10:52:00 +01:00
Alexis Provatas 496d1b914a Generalise Polyakov loop and overload for temporal direction 2025-05-01 10:52:00 +01:00
Peter Boyle d60a80c098 Fixes and visualisation 2025-04-29 18:04:23 -04:00
Peter Boyle bb8b6d9d73 Fix 2025-04-29 18:04:04 -04:00
Chulwoo Jung 2cc6deb8e0 Merge branch 'develop' of https://github.com/paboyle/Grid into ic2 2025-04-25 10:48:41 -04:00
Chulwoo Jung 19d0590579 Checking in for merging 2025-04-25 10:48:22 -04:00
Peter Boyle 677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
Peter Boyle be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
Peter Boyle df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
Peter Boyle 21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
Peter Boyle dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
Peter Boyle ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGaugeAction deriv
2025-04-24 14:39:10 -04:00
Peter Boyle c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
Peter Boyle 6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
Peter Boyle b75809ed61 Update README 2025-04-24 14:27:22 -04:00
Peter Boyle ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
Peter Boyle 6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
Peter Boyle 233150d93f Bug fix for no accelerator aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
Peter Boyle 7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
Chulwoo Jung a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
Chulwoo Jung cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
Christoph Lehner 96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
Christoph Lehner 7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
Peter Boyle e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
Peter Boyle a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
Peter Boyle cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
Peter Boyle 4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
Peter Boyle 11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
Peter Boyle 6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
Peter Boyle 938c47480f Updated compile on frontier.
Unsatisfactory hacks
2025-04-04 18:35:06 -04:00
Peter Boyle 3811d19298 Fence 2025-04-04 18:35:06 -04:00
Peter Boyle 83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
Peter Boyle d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
Peter Boyle adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
Peter Boyle ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
Peter Boyle 4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
Peter Boyle 130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
Peter Boyle 8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
Peter Boyle 0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
Peter Boyle 9eae8fca5d Size output 2025-04-04 18:35:05 -04:00
Peter Boyle 882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
Mashy Green e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
Mashy Green d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
Peter Boyle 199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
Christoph Lehner fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
Christoph Lehner e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
Christoph Lehner d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
paboyle 25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
paboyle 19f9378b98 Should work on Aurora now 2025-03-11 13:50:43 +00:00
Mashy Green 785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
Mashy Green 1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
Mashy Green 0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
Christoph Lehner 9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
Peter Boyle 3d014864e2 Making LLVM happy 2025-03-06 14:19:25 -05:00
paboyle 1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
Peter Boyle a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
Peter Boyle ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
Peter Boyle e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
Peter Boyle 795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
Peter Boyle 267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
Peter Boyle 3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
Peter Boyle bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
Peter Boyle eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
paboyle c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
paboyle 6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
Peter Boyle 311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
paboyle 438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
paboyle b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
Muhammad Asif b1ba209696 Latest upstream with np-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
Muhammad Asif cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
Mashy Green 717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
Mashy Green 98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
Mashy Green fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regresion test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after revering too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
Mashy Green d2dd8f54e2 Fixing after revering too much! 2025-02-17 17:32:27 +00:00
Mashy Green 7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
paboyle ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
paboyle 4c3dd82d84 CSHIFT with bounce through Host memory on MPI packets 2025-02-12 19:09:53 +00:00
paboyle 44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
paboyle a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
paboyle 382e0abefd Was issuing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
paboyle 6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
paboyle 4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
paboyle 1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
paboyle 93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
paboyle 18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
paboyle 4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
paboyle 0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
edbennett 8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
edbennett 09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
edbennett 1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
Mashy Green 355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
paboyle b50fb34e71 Perf on Aurora 2025-02-01 18:39:34 +00:00
paboyle de84d730ff Fastest run config on Aurora to date 2025-02-01 18:08:40 +00:00
Peter Boyle c74d11e3d7 PVdagM MG 2025-02-01 11:04:13 -05:00
Christoph Lehner 84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
paboyle c4fc972fec Merge branch 'feature/deprecate-uvm' into develop 2025-01-31 16:32:36 +00:00
paboyle 8cf809e231 Best results on Aurora so far 2025-01-31 16:14:45 +00:00
paboyle 94019a922e Significantly better performance on Aurora without using pipeline mode 2025-01-30 16:36:46 +00:00
Mashy Green 4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
Mashy Green aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
paboyle d6b2727f86 Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora 2025-01-29 09:22:21 +00:00
paboyle 74a4f43946 Optional host buffer bounce for no CUDA aware MPI 2025-01-28 15:22:46 +00:00
paboyle 1caf8b0f86 Rename 2025-01-28 15:22:37 +00:00
Chulwoo Jung 570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
Chulwoo Jung a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
Peter Boyle 3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
Chulwoo Jung f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
Chulwoo Jung 2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
Mashy Green d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
Mashy Green e99d42404e Removing the regresion test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
Mashy Green 3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
Mashy Green 47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
paboyle 8fe429346f Dslash testing for reproduce 2024-11-11 23:11:11 +00:00
Peter Boyle 5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
Peter Boyle b91fc1b6b4 Merge branch 'feature/boosted' into feature/deprecate-uvm
Fixed boosted free field test
2024-10-28 16:53:09 -04:00
Peter Boyle eafc150034 Test fft asserts 2024-10-23 16:46:26 -04:00
Peter Boyle 2877f1a268 Verbose reduce 2024-10-23 15:14:16 -04:00
Peter Boyle 1e893af775 GPU happy 2024-10-23 14:52:15 -04:00
Peter Boyle d9f430a575 Happy GPU 2024-10-23 14:51:16 -04:00
Peter Boyle 63abe87f36 Memory manager verbose improvements that were useful to track an error 2024-10-23 14:49:13 -04:00
Peter Boyle 368d649c8a feature/deprecate-uvm happier -- preallocate device resident neighbour table 2024-10-23 14:47:55 -04:00
Peter Boyle 5603464f39 Fix in partial fraction import/export physical and
make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class
2024-10-23 14:45:58 -04:00
Peter Boyle 655c79f39e Suppress warning on partial override 2024-10-23 14:44:41 -04:00
Peter Boyle 565b231c03 Nvcc happy 2024-10-23 14:44:17 -04:00
Peter Boyle 62a9f180fa NVCC happy 2024-10-23 14:44:04 -04:00
Peter Boyle 5ae77876a8 Meson field and Aslash field on GPU; some compiler warning removed 2024-10-18 19:08:06 -04:00
Peter Boyle 4ed2c2c74f Config command 2024-10-18 13:58:33 -04:00
Peter Boyle 955da582b6 Working on NVCC 2024-10-18 13:58:03 -04:00
Peter Boyle 11b07b950d Vanilla linux compile, assuming spack prerequisites 2024-10-18 13:57:40 -04:00
Peter Boyle 8f70cfeda9 Clean up 2024-10-18 13:56:53 -04:00
Peter Boyle ce64271048 Remove the copying version 2024-10-18 13:56:24 -04:00
paboyle 5cc4f3241d Meson field test 2024-10-18 15:42:30 +00:00
Peter Boyle 6815e138b4 Boosted fermion attempt 2024-10-17 18:37:33 +01:00
paboyle a78a61d76f Update configure 2024-10-15 14:38:45 +00:00
paboyle 2eff3f34ed Alternate reduction; default to grids own but make a configure flag
--enable-reduction=grid|mpi
2024-10-15 14:36:06 +00:00
paboyle 03687c1d62 Final version of test, closer to original again 2024-10-15 14:35:17 +00:00
paboyle febfe4e77f Make my own reduction a configure flag 2024-10-15 14:32:35 +00:00
paboyle 4d1aa134b5 Use normal reduction, configure flag to force deterministic 2024-10-15 14:32:11 +00:00
paboyle 5ec879860a Odd rounding issue - bears looking into 2024-10-15 14:30:54 +00:00
Peter Boyle f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
paboyle b728af903c Fast axpy norm under CFLAG 2024-10-11 03:23:09 +00:00
paboyle 54f1999030 axpy_norm_fast -- wasn't using the deterministic MPI sum causing issues 2024-10-11 03:22:18 +00:00
paboyle fd58f0b669 Return ok 2024-10-11 03:21:21 +00:00
paboyle c5c67b706e cl::sycl -> SYCL 2024-10-10 22:04:12 +00:00
paboyle be7a543e2c Revert barriers -- these were not the problem 2024-10-10 22:03:29 +00:00
paboyle 68f112d576 New software moves cl::sycl 2024-10-10 22:03:04 +00:00
paboyle ec1395a304 Better flight logging 2024-10-10 22:01:57 +00:00
paboyle beb0e474ee Use deterministic own brand reduction 2024-10-10 22:01:24 +00:00
paboyle 2b5fdcbbc5 New software version 2024-10-10 21:59:02 +00:00
paboyle 295127d456 Deterministic homebrew reduction 2024-10-10 21:58:26 +00:00
paboyle 7dcfb13694 New software stack 2024-10-10 21:57:35 +00:00
Peter Boyle ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows for work around on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
Peter Boyle 2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
Peter Boyle 1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
Peter Boyle 3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
Peter Boyle 35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
Peter Boyle bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
Peter Boyle f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
paboyle 9fa8bd6438 Configure for AOT on Aurora latest software 2024-09-23 11:25:44 +00:00
paboyle 02c8178f16 Almost working on Aurora 2024-09-23 09:43:50 +00:00
paboyle e637fbacae Verbose remove 2024-09-23 09:42:43 +00:00
paboyle 066544281f Deprecate UVM 2024-09-17 13:34:27 +00:00
paboyle 11be10d2c0 Aurora testing 2024-09-10 18:11:52 +00:00
paboyle 160969a758 UVM tester, doesn't turn up anything 2024-09-10 18:09:42 +00:00
paboyle 622f78ebea SYCL updates -- operator = giving trouble on Aurora.
SYCL reduction is failing intermittently with SVM interface - returns
zero, expect non-zero.
Think I need to remove ALL dependence on SVM.
2024-09-04 13:53:48 +00:00
Peter Boyle aa67a5b095 Rename 2024-08-27 19:54:01 +00:00
Peter Boyle af9ea0864c Blas fix 2024-08-27 19:53:09 +00:00
Peter Boyle 4e2a6d87c4 Gemm batched fix 2024-08-27 19:24:05 +00:00
Peter Boyle a465ecece9 Aurora 2024-08-27 19:20:43 +00:00
Peter Boyle 575eb72182 Converges on 16^3 2024-08-27 19:20:38 +00:00
Peter Boyle 3a973914d6 Compile on frontier 2024-08-27 14:55:42 -04:00
Peter Boyle f568c07bbd Improved the BLAS benchmark 2024-08-27 14:53:54 -04:00
Peter Boyle 2c9878fc3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-08-27 12:05:46 -04:00
Peter Boyle 27b1b1b005 Checkerboard available for offloading pickCheckerboard 2024-08-27 12:04:09 -04:00
Peter Boyle 130d7ab077 Verbose changes 2024-08-27 12:03:28 -04:00
Peter Boyle 29f6b8a74a Setup 2024-08-27 12:02:49 -04:00
Peter Boyle 9779aaea33 16^3 optimise 2024-08-27 11:38:35 -04:00
Peter Boyle ec25604a67 Fastest solver for mrhs multigrid 2024-08-27 11:32:34 -04:00
Peter Boyle 3668e81c5e Extract slice working on checkerboard field for Block Lanczos 2024-08-27 11:31:30 -04:00
Peter Boyle d66b2423cb Move slice operations to GPU for BlockCG 2024-08-27 11:28:47 -04:00
Peter Boyle 15cc78f0b6 peek/poke local site on checkerboard arrays 2024-08-27 11:23:42 -04:00
Peter Boyle 06db4ddea2 Fast init on GPU 2024-08-27 11:22:33 -04:00
Peter Boyle 6cfb90e99f Support needed for accelerator resident set/pick Checkerboard 2024-08-27 11:19:00 -04:00
Peter Boyle d8be95a2a3 Don't early terminate power method to get more accurate top EV 2024-08-27 11:17:37 -04:00
Peter Boyle f82702872d Normal residual 2024-08-27 11:16:44 -04:00
Peter Boyle 3752c49ef0 Add option to record the CG polynomial 2024-08-27 11:14:35 -04:00
Peter Boyle fe65fa4988 MulMatrix 2024-08-27 11:13:18 -04:00
Peter Boyle 1fe4c205a3 Adef 2024-08-27 11:11:47 -04:00
Peter Boyle d4dc5e0f43 BlockCG linalg acceleration with BLAS 2024-08-27 11:08:33 -04:00
Peter Boyle 77944437ce Functor initialisation 2024-08-27 11:01:02 -04:00
Peter Boyle c164bff758 MMdag 2024-08-27 11:00:36 -04:00
Peter Boyle aa2e3d954a MMdag operator 2024-08-27 10:59:29 -04:00
Peter Boyle de62b04728 Block CG linalg acceleration 2024-08-27 10:58:54 -04:00
Peter Boyle d0bdb50f24 Analyse power spectrum 2024-08-27 10:58:19 -04:00
Peter Boyle a8fecbc609 BlockCG linalg via BLAS 2024-08-21 16:08:16 -04:00
edbennett 8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
Peter Boyle e29b97b3ea Qslash term added 2023-09-14 16:14:03 -04:00
Peter Boyle ad2b699d2b Better macos 2023-09-14 16:12:21 -04:00
474 changed files with 26663 additions and 7645 deletions
+419 -346
View File
@@ -12,15 +12,13 @@
#include <iostream> #include <iostream>
#include <sys/time.h> #include <sys/time.h>
#define GRID_SYCL
#undef GRID_HIP
#undef GRID_CUDA
#ifdef GRID_HIP #ifdef GRID_HIP
#include <hipblas/hipblas.h> #include <hipblas/hipblas.h>
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
#include <cublas_v2.h> #include <cublas_v2.h>
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
#include <oneapi/mkl.hpp> #include <oneapi/mkl.hpp>
@@ -45,6 +43,90 @@ inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerat
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
#endif
#ifdef GRID_HIP
hipStream_t copyStream;
hipStream_t computeStream;
void acceleratorInit(void)
{
int device = 0;
auto discard = hipSetDevice(device);
discard = hipStreamCreate(&copyStream);
discard = hipStreamCreate(&computeStream);
printf("AcceleratorHIPInit\n");
}
// Allocate `bytes` of device memory with hipMalloc.
// Returns the device pointer, or NULL (after logging to stderr) when the
// allocation fails; callers are expected to check for NULL.
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    // %zu is the conversion for size_t; %ld is wrong wherever long and size_t differ
    fprintf(stderr," hipMalloc failed for %zu %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
  }
  return ptr;
}
// HIP device-memory helpers: release, fill, and host<->device copies.
// The status returned by each HIP runtime call is intentionally ignored.

// Release a device allocation; `bytes` is accepted but not used.
inline void acceleratorFreeDevice(void *ptr,size_t bytes)
{
  (void)hipFree(ptr);
}
// Release a device allocation.
inline void acceleratorFreeDevice(void *ptr)
{
  (void)hipFree(ptr);
}
// Fill `bytes` bytes starting at `base` with `value`.
inline void acceleratorMemSet(void *base,int value,size_t bytes)
{
  (void)hipMemset(base,value,bytes);
}
// Copy `bytes` bytes from host memory `from` to device memory `to`.
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)
{
  (void)hipMemcpy(to,from,bytes,hipMemcpyHostToDevice);
}
// Copy `bytes` bytes from device memory `from` to host memory `to`.
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
{
  (void)hipMemcpy(to,from,bytes,hipMemcpyDeviceToHost);
}
// Wait for all work queued on computeStream, then query the HIP runtime's
// last-error state. On error the message, file and line are printed and the
// process terminates with a non-zero status so that scripts and schedulers
// can see the failure (the previous exit(0) reported success).
#define accelerator_barrier(dummy) \
{ \
auto tmp=hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipStreamSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(1); \
} \
}
#endif
#ifdef GRID_CUDA
cudaStream_t copyStream;
cudaStream_t computeStream;
void acceleratorInit(void)
{
int device = 0;
cudaSetDevice(device);
cudaStreamCreate(&copyStream);
cudaStreamCreate(&computeStream);
}
// Allocate `bytes` of device memory with cudaMalloc.
// Returns the device pointer, or NULL (after printing a diagnostic) when the
// allocation fails; callers are expected to check for NULL.
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    // %zu is the conversion for size_t; passing a size_t to %d is a format/argument mismatch
    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
}
// CUDA device-memory helpers: release, host<->device copies, and fill.
// The status returned by each CUDA runtime call is not inspected.

// Release an allocation through cudaFree.
inline void acceleratorFreeShared(void *ptr)
{
  cudaFree(ptr);
}
// Release a device allocation through cudaFree.
inline void acceleratorFreeDevice(void *ptr)
{
  cudaFree(ptr);
}
// Copy `bytes` bytes from host memory `from` to device memory `to`.
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)
{
  cudaMemcpy(to,from,bytes,cudaMemcpyHostToDevice);
}
// Copy `bytes` bytes from device memory `from` to host memory `to`.
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
{
  cudaMemcpy(to,from,bytes,cudaMemcpyDeviceToHost);
}
// Fill `bytes` bytes starting at `base` with `value`.
inline void acceleratorMemSet(void *base,int value,size_t bytes)
{
  cudaMemset(base,value,bytes);
}
// Wait for all work queued on computeStream, then query the CUDA runtime's
// last-error state. On error the CUDA error string plus the file and line of
// the expansion site are printed and stdout is flushed; the failure only
// becomes fatal (via GRID_ASSERT) when acceleratorAbortOnGpuError is set.
// The macro argument is ignored.
#define accelerator_barrier(dummy) \
{ \
cudaStreamSynchronize(computeStream); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \
cudaGetErrorString( err )); \
printf("File %s Line %d\n",__FILE__,__LINE__); \
fflush(stdout); \
if (acceleratorAbortOnGpuError) GRID_ASSERT(err==cudaSuccess); \
} \
}
#endif
template<class T> void acceleratorPut(T& dev,T&host) template<class T> void acceleratorPut(T& dev,T&host)
{ {
acceleratorCopyToDevice(&host,&dev,sizeof(T)); acceleratorCopyToDevice(&host,&dev,sizeof(T));
@@ -55,9 +137,6 @@ template<class T> T acceleratorGet(T& dev)
acceleratorCopyFromDevice(&dev,&host,sizeof(T)); acceleratorCopyFromDevice(&dev,&host,sizeof(T));
return host; return host;
} }
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
#endif
/************************************************************** /**************************************************************
* Allocator * Allocator
@@ -89,7 +168,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) { if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
} }
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -197,11 +276,11 @@ public:
{ {
#ifdef GRID_HIP #ifdef GRID_HIP
auto err = hipDeviceSynchronize(); auto err = hipDeviceSynchronize();
assert(err==hipSuccess); GRID_ASSERT(err==hipSuccess);
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
auto err = cudaDeviceSynchronize(); auto err = cudaDeviceSynchronize();
assert(err==cudaSuccess); GRID_ASSERT(err==cudaSuccess);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
accelerator_barrier(); accelerator_barrier();
@@ -211,6 +290,269 @@ public:
#endif #endif
} }
/////////////////////////////////////////////////////////////
// Single matrix GEMM -- fp64 and fp32
/////////////////////////////////////////////////////////////
// Single (non-batched) double-precision complex GEMM:
//   Cmn = beta * Cmn + alpha * op(Amk) * op(Bkn)
// op() is the identity for GridBLAS_OP_N and the conjugate transpose for
// GridBLAS_OP_C; GridBLAS_OP_T is rejected by the assertions below.
// m, n, k : op(A) is m x k, op(B) is k x n and C is m x n, all column major.
// Amk, Bkn, Cmn : matrix storage handed directly to the selected backend
//   (hipBLAS, cuBLAS, oneMKL, or Eigen when no accelerator macro is defined).
void gemm(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // Device pointer
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn)
{
RealD t2=usecond();
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
// Leading dimensions for the untransposed layout ...
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x n column major
// ... and for the conjugated case, where A is stored k x m and B is stored n x k
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
// One-element device buffers for the scalars; function-local statics, so they are
// created once and shared by every call of this overload.
static deviceVector<ComplexD> alpha_p(1);
static deviceVector<ComplexD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
RealD t0=usecond();
#ifdef GRID_HIP
// Translate the Grid operation enum to the hipBLAS one and call Zgemm
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasZgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda,
(hipblasDoubleComplex *) Bkn, ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
// Translate the Grid operation enum to the cuBLAS one and call Zgemm
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasZgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda,
(cuDoubleComplex *) Bkn, ldb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
// oneMKL takes 64-bit sizes and leading dimensions
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
iOpA,
iOpB,
m64,n64,k64,
(ComplexD *) &alpha_p[0],
(const ComplexD *)Amk, (int64_t )lda64,
(const ComplexD *)Bkn, (int64_t )ldb64,
(ComplexD *) &beta_p[0],
(ComplexD *)Cmn, (int64_t)ldc64);
// Only the SYCL path synchronises inside this function
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
// Each branch maps the raw pointers with the stored (pre-op) shape and applies
// adjoint() where the conjugate transpose was requested.
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} else {
assert(0);
}
#endif
// Timing and flop/byte estimates are computed here but are neither reported
// nor returned by this function.
RealD t1=usecond();
RealD flops = 8.0*m*n*k;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n);
}
// Single (non-batched) single-precision complex GEMM:
//   Cmn = beta * Cmn + alpha * op(Amk) * op(Bkn)
// op() is the identity for GridBLAS_OP_N and the conjugate transpose for
// GridBLAS_OP_C; GridBLAS_OP_T is rejected by the assertions below.
// m, n, k : op(A) is m x k, op(B) is k x n and C is m x n, all column major.
// Amk, Bkn, Cmn : matrix storage handed directly to the selected backend
//   (hipBLAS, cuBLAS, oneMKL, or Eigen when no accelerator macro is defined).
void gemm(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexF alpha,
ComplexF* Amk, // Device pointer
ComplexF* Bkn,
ComplexF beta,
ComplexF* Cmn)
{
RealD t2=usecond();
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
// Leading dimensions for the untransposed layout ...
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x n column major
// ... and for the conjugated case, where A is stored k x m and B is stored n x k
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
// One-element device buffers for the scalars; function-local statics, so they are
// created once and shared by every call of this overload.
static deviceVector<ComplexF> alpha_p(1);
static deviceVector<ComplexF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
#ifdef GRID_HIP
// Translate the Grid operation enum to the hipBLAS one and call Cgemm
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasCgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex *) Amk, lda,
(hipblasComplex *) Bkn, ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex *) Cmn, ldc);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
// Translate the Grid operation enum to the cuBLAS one and call Cgemm
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex *) Amk, lda,
(cuComplex *) Bkn, ldb,
(cuComplex *) &beta_p[0],
(cuComplex *) Cmn, ldc);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
// oneMKL takes 64-bit sizes and leading dimensions
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
iOpA,
iOpB,
m64,n64,k64,
(ComplexF *) &alpha_p[0],
(const ComplexF *)Amk, (int64_t )lda64,
(const ComplexF *)Bkn, (int64_t )ldb64,
(ComplexF *) &beta_p[0],
(ComplexF *)Cmn, (int64_t )ldc64);
// Only the SYCL path synchronises inside this function
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
// Each branch maps the raw pointers with the stored (pre-op) shape and applies
// adjoint() where the conjugate transpose was requested.
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} else {
assert(0);
}
#endif
// Timing and flop/byte estimates are computed here but are neither reported
// nor returned by this function.
RealD t1=usecond();
RealD flops = 8.0*m*n*k;
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n);
}
/////////////////////////////////////////////////////////////
void gemmBatched(int m,int n, int k, void gemmBatched(int m,int n, int k,
ComplexD alpha, ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices deviceVector<ComplexD*> &Amk, // pointer list to matrices
@@ -241,36 +583,6 @@ public:
beta, beta,
Cmn); Cmn);
} }
void gemmBatched(int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(GridBLASOperation_t OpA, void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB, GridBLASOperation_t OpB,
@@ -283,11 +595,11 @@ public:
{ {
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount); GRID_ASSERT(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount); GRID_ASSERT(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T); GRID_ASSERT(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
@@ -324,7 +636,7 @@ public:
(hipblasDoubleComplex **)&Cmn[0], ldc, (hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount); batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl; // std::cout << " hipblas return code " <<(int)err<<std::endl;
assert(err==HIPBLAS_STATUS_SUCCESS); GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
cublasOperation_t hOpA; cublasOperation_t hOpA;
@@ -345,7 +657,7 @@ public:
(cuDoubleComplex *) &beta_p[0], (cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc, (cuDoubleComplex **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==CUBLAS_STATUS_SUCCESS); GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
int64_t m64=m; int64_t m64=m;
@@ -492,8 +804,8 @@ public:
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T); GRID_ASSERT(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
@@ -509,8 +821,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF)); acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond(); RealD t0=usecond();
assert(Bkn.size()==batchCount); GRID_ASSERT(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount); GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP #ifdef GRID_HIP
hipblasOperation_t hOpA; hipblasOperation_t hOpA;
hipblasOperation_t hOpB; hipblasOperation_t hOpB;
@@ -531,7 +843,7 @@ public:
(hipblasComplex **)&Cmn[0], ldc, (hipblasComplex **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS); GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
cublasOperation_t hOpA; cublasOperation_t hOpA;
@@ -552,7 +864,7 @@ public:
(cuComplex *) &beta_p[0], (cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc, (cuComplex **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==CUBLAS_STATUS_SUCCESS); GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
int64_t m64=m; int64_t m64=m;
@@ -624,301 +936,6 @@ public:
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount; RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
} }
///////////////////////////////////////////////////////////////////////////
// Single precision real GEMM
///////////////////////////////////////////////////////////////////////////
// Batched single-precision real GEMM. For every p in [0, Amk.size()):
//   Cmn[p] = beta * Cmn[p] + alpha * op(Amk[p]) * op(Bkn[p])
// op() is the identity for GridBLAS_OP_N and the transpose for GridBLAS_OP_T;
// GridBLAS_OP_C is rejected by the assertions below.
// m, n, k : op(A) is m x k, op(B) is k x n and C is m x n, all column major.
// Amk, Bkn, Cmn : lists of matrix pointers; all three must have the same length.
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
// Leading dimensions for the untransposed layout ...
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x n column major
// ... and for the transposed case, where A is stored k x m and B is stored n x k
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
// One-element device buffers for the scalars; function-local statics, so they are
// created once and shared by every call of this overload.
static deviceVector<RealF> alpha_p(1);
static deviceVector<RealF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
// Translate the Grid operation enum to the hipBLAS one and call SgemmBatched
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
// Translate the Grid operation enum to the cuBLAS one and call SgemmBatched
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
// oneMKL takes 64-bit sizes; the whole batch is submitted as a single group
// (group count 1) whose size is batchCount64
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(float *) &alpha_p[0],
(const float **)&Amk[0], (const int64_t *)&lda64,
(const float **)&Bkn[0], (const int64_t *)&ldb64,
(float *) &beta_p[0],
(float **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
// Only the SYCL path synchronises inside this function
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
// Each branch maps the raw pointers with the stored (pre-op) shape, applies
// transpose() where requested, and processes the batch entries with thread_for.
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
#endif
// Timing and flop/byte estimates are computed here but are neither reported
// nor returned by this function.
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
// Batched double-precision real GEMM. For every p in [0, Amk.size()):
//   Cmn[p] = beta * Cmn[p] + alpha * op(Amk[p]) * op(Bkn[p])
// op() is the identity for GridBLAS_OP_N and the transpose for GridBLAS_OP_T;
// GridBLAS_OP_C is rejected by the assertions below.
// m, n, k : op(A) is m x k, op(B) is k x n and C is m x n, all column major.
// Amk, Bkn, Cmn : lists of matrix pointers; all three must have the same length.
void gemmBatched(GridBLASOperation_t OpA,
		 GridBLASOperation_t OpB,
		 int m,int n, int k,
		 RealD alpha,
		 deviceVector<RealD*> &Amk, // pointer list to matrices
		 deviceVector<RealD*> &Bkn,
		 RealD beta,
		 deviceVector<RealD*> &Cmn)
{
  RealD t2=usecond();
  int32_t batchCount = Amk.size();

  assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
  assert(OpB!=GridBLAS_OP_C);

  // Leading dimensions for the untransposed layout ...
  int lda = m; // m x k column major
  int ldb = k; // k x n column major
  int ldc = m; // m x n column major
  // ... and for the transposed case, where A is stored k x m and B is stored n x k
  if(OpA!=GridBLAS_OP_N)
    lda = k;
  if(OpB!=GridBLAS_OP_N)
    ldb = n;

  // One-element device buffers for the scalars; function-local statics, so they are
  // created once and shared by every call of this overload.
  static deviceVector<RealD> alpha_p(1);
  static deviceVector<RealD> beta_p(1);
  // can prestore the 1 and the zero on device
  acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
  acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
  RealD t0=usecond();

  assert(Bkn.size()==batchCount);
  assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
  hipblasOperation_t hOpA;
  hipblasOperation_t hOpB;
  if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
  if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
  if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
  if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
  if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
  if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
  // Pass the translated operations. Hard-coding HIPBLAS_OP_N here ignored the
  // caller's transpose request while lda/ldb above were already set for it.
  auto err = hipblasDgemmBatched(gridblasHandle,
				 hOpA,
				 hOpB,
				 m,n,k,
				 (double *) &alpha_p[0],
				 (double **)&Amk[0], lda,
				 (double **)&Bkn[0], ldb,
				 (double *) &beta_p[0],
				 (double **)&Cmn[0], ldc,
				 batchCount);
  assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
  cublasOperation_t hOpA;
  cublasOperation_t hOpB;
  if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
  if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
  if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
  if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
  if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
  if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
  auto err = cublasDgemmBatched(gridblasHandle,
				hOpA,
				hOpB,
				m,n,k,
				(double *) &alpha_p[0],
				(double **)&Amk[0], lda,
				(double **)&Bkn[0], ldb,
				(double *) &beta_p[0],
				(double **)&Cmn[0], ldc,
				batchCount);
  assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
  // oneMKL takes 64-bit sizes; the whole batch is submitted as a single group
  // (group count 1) whose size is batchCount64
  int64_t m64=m;
  int64_t n64=n;
  int64_t k64=k;
  int64_t lda64=lda;
  int64_t ldb64=ldb;
  int64_t ldc64=ldc;
  int64_t batchCount64=batchCount;

  oneapi::mkl::transpose iOpA;
  oneapi::mkl::transpose iOpB;
  if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
  if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
  if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
  if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
  if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
  if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;

  oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
					      &iOpA,
					      &iOpB,
					      &m64,&n64,&k64,
					      (double *) &alpha_p[0],
					      (const double **)&Amk[0], (const int64_t *)&lda64,
					      (const double **)&Bkn[0], (const int64_t *)&ldb64,
					      (double *) &beta_p[0],
					      (double **)&Cmn[0], (const int64_t *)&ldc64,
					      (int64_t)1,&batchCount64,std::vector<sycl::event>());
  // Only the SYCL path synchronises inside this function
  synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
  // Need a default/reference implementation; use Eigen
  // Each branch maps the raw pointers with the stored (pre-op) shape, applies
  // transpose() where requested, and processes the batch entries with thread_for.
  if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
    thread_for (p, batchCount, {
      Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
      Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
      Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
    });
  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
    thread_for (p, batchCount, {
      Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
      Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
      Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
    });
  } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
    thread_for (p, batchCount, {
      Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
      Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
      Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
      eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
    });
  } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
    thread_for (p, batchCount, {
      Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
      Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
      Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
      eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
    });
  } else {
    assert(0);
  }
#endif
  // Timing and flop/byte estimates are computed here but are neither reported
  // nor returned by this function.
  RealD t1=usecond();
  RealD flops = 2.0*m*n*k*batchCount;
  RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
template<class CComplex> template<class CComplex>
double benchmark(int M, int N, int K, int BATCH) double benchmark(int M, int N, int K, int BATCH)
{ {
@@ -967,6 +984,47 @@ public:
return flops; // Returns gigaflops return flops; // Returns gigaflops
} }
template<class CComplex>
double benchmark(int M, int N, int K)
{
int32_t N_A = M*K;
int32_t N_B = K*N;
int32_t N_C = M*N;
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
CComplex alpha(1.0);
CComplex beta (1.0);
RealD flops = 8.0*M*N*K;
int ncall=10;
gemm(GridBLAS_OP_C,GridBLAS_OP_N,
M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0]);
synchronise();
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemm(GridBLAS_OP_N,GridBLAS_OP_N,
M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0]);
synchronise();
}
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K);
flops = 8.0*M*N*K*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
}; };
@@ -1035,6 +1093,21 @@ static void BLAS(void)
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl; std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}} }}
fprintf(FP,"\n\n\n"); fprintf(FP,"\n\n\n");
std::cout << "----------------------------------------------------------"<<std::endl;
std::cout << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (inner product matrix)"<<std::endl;
std::cout << "----------------------------------------------------------"<<std::endl;
{
int M=12;
int N=12;
std::vector<int> ks({4*1024*1024, 2*1024*1024, 1024*1024, 256*1024, 1024 });
for( int kk=0;kk<ks.size();kk++ ) {
int K = ks[kk];
double p=blas.benchmark<CComplex>(M,N,K);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, 1, p);
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<1<<"\t\t"<<p<<std::endl;
}
}
std::cout << "=================================================================================="<<std::endl; std::cout << "=================================================================================="<<std::endl;
}; };
+1 -1
View File
@@ -1,2 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
+5
View File
@@ -0,0 +1,5 @@
# Build BatchBlasBench with hipcc for the HIP code path (-DGRID_HIP).
# ROCM_PATH, MPICH_DIR and CRAY_MPICH_ROOTDIR must be set in the environment.
CXX=hipcc
MPICXX=mpicxx
# ${ROCM_PATH}, not {$ROCM_PATH}: the latter leaves literal braces in the include path
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
# The flag variables are left unquoted on purpose so they split into separate arguments
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench
+2
View File
@@ -0,0 +1,2 @@
# Build BatchBlasBench.cc into ./BatchBlasBench through the mpicxx wrapper with
# SYCL enabled (-fsycl); -DGRID_SYCL selects the source's GRID_SYCL code path.
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
+2
View File
@@ -51,11 +51,13 @@ directory
#pragma nv_diag_suppress cast_to_qualified_type #pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files //disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored #pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon #pragma nv_diag_suppress extra_semicolon
#else #else
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero #pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type #pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files //disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored #pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon #pragma diag_suppress extra_semicolon
+39 -4
View File
@@ -1,9 +1,17 @@
#ifndef GRID_STD_H #ifndef GRID_STD_H
#define GRID_STD_H #define GRID_STD_H
///////////////////
// Grid config
///////////////////
#include "Config.h"
/////////////////// ///////////////////
// Std C++ dependencies // Std C++ dependencies
/////////////////// ///////////////////
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <memory> #include <memory>
@@ -15,7 +23,9 @@
#include <random> #include <random>
#include <functional> #include <functional>
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h>
#include <strings.h> #include <strings.h>
#include <stdio.h> #include <stdio.h>
#include <signal.h> #include <signal.h>
@@ -23,11 +33,36 @@
#include <sys/time.h> #include <sys/time.h>
#include <chrono> #include <chrono>
#include <zlib.h> #include <zlib.h>
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
// Abort hook invoked by GRID_ASSERT on failure. Only declared here; the
// definition is not part of this header section.
void GridAbort(void);
// ASSLOG(A): emit the C string A on stderr with a raw ::write of ::strlen(A)
// bytes (no stdio buffering involved). The macro body already ends in ';'.
#define ASSLOG(A) ::write(STDERR_FILENO,A,::strlen(A));
// GRID_ASSERT(b): when b evaluates false, log " GRID_ASSERT failure: ",
// __FILE__ and the stringified condition to stderr, then call GridAbort().
// NOTE(review): the expansion is a bare `if(!(b)) { ... };` rather than a
// do { } while(0) wrapper, so using it as the unbraced body of an if that has
// an else branch would not parse as intended.
#ifdef HAVE_EXECINFO_H
// Variant with <execinfo.h>: flushes stdout first, then captures up to
// _NBACKTRACE frames into Grid_backtrace_buffer and writes the symbols to
// stderr before aborting.
#define GRID_ASSERT(b) if(!(b)) { \
fflush(stdout); \
ASSLOG(" GRID_ASSERT failure: "); \
ASSLOG(__FILE__); \
ASSLOG(" : "); \
ASSLOG(#b); \
ASSLOG(" : "); \
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE); \
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,STDERR_FILENO); \
GridAbort(); \
};
#else
// Variant without <execinfo.h>: same log text, no stdout flush and no
// backtrace.
#define GRID_ASSERT(b) if(!(b)) { \
ASSLOG(" GRID_ASSERT failure: "); \
ASSLOG(__FILE__); \
ASSLOG(" : "); \
ASSLOG(#b); \
ASSLOG(" : "); \
GridAbort(); \
};
#endif
///////////////////
// Grid config
///////////////////
#include "Config.h"
#ifdef TOFU #ifdef TOFU
#undef GRID_COMMS_THREADS #undef GRID_COMMS_THREADS
+4
View File
@@ -54,6 +54,7 @@ Version.h: version-cache
include Make.inc include Make.inc
include Eigen.inc include Eigen.inc
if BUILD_FERMION_INSTANTIATIONS
extra_sources+=$(WILS_FERMION_FILES) extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES) extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS if BUILD_ZMOBIUS
@@ -68,8 +69,11 @@ if BUILD_FERMION_REPS
endif endif
if BUILD_SP if BUILD_SP
extra_sources+=$(SP_FERMION_FILES) extra_sources+=$(SP_FERMION_FILES)
if BUILD_FERMION_REPS
extra_sources+=$(SP_TWOIND_FERMION_FILES) extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif endif
endif
endif
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a
+1 -1
View File
@@ -29,8 +29,8 @@ directory
#pragma once #pragma once
#include <type_traits> #include <type_traits>
#include <cassert>
#include <exception> #include <exception>
#include <cassert>
#define NAMESPACE_BEGIN(A) namespace A { #define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) } #define NAMESPACE_END(A) }
+9
View File
@@ -50,6 +50,9 @@ NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/Deflation.h> #include <Grid/algorithms/deflation/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h> #include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h> #include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
// Not really deflation, but useful
#include <Grid/algorithms/blas/MomentumProject.h>
NAMESPACE_CHECK(deflation); NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad); NAMESPACE_CHECK(ConjGrad);
@@ -72,6 +75,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h> #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h> #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/SimpleLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h> #include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/iterative/AdefGeneric.h> #include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/AdefMrhs.h> #include <Grid/algorithms/iterative/AdefMrhs.h>
@@ -80,4 +84,9 @@ NAMESPACE_CHECK(PowerMethod);
NAMESPACE_CHECK(multigrid); NAMESPACE_CHECK(multigrid);
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>
#include <Grid/algorithms/iterative/KrylovSchur.h>
#include <Grid/algorithms/iterative/Arnoldi.h>
#include <Grid/algorithms/iterative/LanczosBidiagonalization.h>
#include <Grid/algorithms/iterative/RestartedLanczosBidiagonalization.h>
#endif #endif
+320 -118
View File
@@ -28,6 +28,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_FFT_H_ #ifndef _GRID_FFT_H_
#define _GRID_FFT_H_ #define _GRID_FFT_H_
#ifdef GRID_CUDA
#include <cufft.h>
#endif
#ifdef GRID_HIP
#include <hipfft/hipfft.h>
#endif
#if !defined(GRID_CUDA) && !defined(GRID_HIP)
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
#if defined(USE_MKL) || defined(GRID_SYCL) #if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h> #include <fftw/fftw3.h>
@@ -35,88 +44,190 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <fftw3.h> #include <fftw3.h>
#endif #endif
#endif #endif
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { }; #ifndef FFTW_FORWARD
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#define FFTW_ESTIMATE (0)
#endif
// Primary template: deliberately empty. The ComplexD / ComplexF
// specialisations in this file (selected by GRID_HIP, GRID_CUDA or HAVE_FFTW)
// provide FFTW_scalar, FFTW_plan and the fftw_* wrapper functions.
template<class scalar> struct FFTW {
};
#ifdef GRID_HIP
// hipFFT implementation of the FFTW-shaped wrapper for double precision
// complex data. Plans are hipfftHandle objects; every hipFFT status that is
// captured is asserted to be HIPFFT_SUCCESS.
template<> struct FFTW<ComplexD> {
public:
  typedef hipfftDoubleComplex FFTW_scalar;
  typedef hipfftHandle        FFTW_plan;

  static const int forward=FFTW_FORWARD;
  static const int backward=FFTW_BACKWARD;

  // Create a batched Z2Z plan. The signature mirrors fftw_plan_many_dft, but
  // in, out, inembed, onembed, sign and flags are not forwarded: n is used
  // for both embedding arrays and the direction is chosen at execute time.
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
                                      FFTW_scalar *in, int *inembed,
                                      int istride, int idist,
                                      FFTW_scalar *out, int *onembed,
                                      int ostride, int odist,
                                      int sign, unsigned flags) {
    FFTW_plan plan;
    const hipfftResult rv = hipfftPlanMany(&plan,rank,n,
                                           n,istride,idist,
                                           n,ostride,odist,
                                           HIPFFT_Z2Z,howmany);
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
    return plan;
  }

  // Run the plan from in to out; sign==forward selects HIPFFT_FORWARD, any
  // other value selects HIPFFT_BACKWARD. accelerator_barrier() is called
  // before the returned status is asserted.
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    const int direction = ( sign == forward ) ? HIPFFT_FORWARD : HIPFFT_BACKWARD;
    hipfftResult rv = hipfftExecZ2Z(p,in,out,direction);
    accelerator_barrier();
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
  }

  // Release the hipFFT plan handle.
  inline static void fftw_destroy_plan(const FFTW_plan p) {
    hipfftDestroy(p);
  }
};
// hipFFT implementation of the FFTW-shaped wrapper for single precision
// complex data. Plans are hipfftHandle objects; every hipFFT status that is
// captured is asserted to be HIPFFT_SUCCESS.
template<> struct FFTW<ComplexF> {
public:
  typedef hipfftComplex FFTW_scalar;
  typedef hipfftHandle  FFTW_plan;

  static const int forward=FFTW_FORWARD;
  static const int backward=FFTW_BACKWARD;

  // Create a batched C2C plan. The signature mirrors fftwf_plan_many_dft, but
  // in, out, inembed, onembed, sign and flags are not forwarded: n is used
  // for both embedding arrays and the direction is chosen at execute time.
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
                                      FFTW_scalar *in, int *inembed,
                                      int istride, int idist,
                                      FFTW_scalar *out, int *onembed,
                                      int ostride, int odist,
                                      int sign, unsigned flags) {
    FFTW_plan plan;
    const hipfftResult rv = hipfftPlanMany(&plan,rank,n,
                                           n,istride,idist,
                                           n,ostride,odist,
                                           HIPFFT_C2C,howmany);
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
    return plan;
  }

  // Run the plan from in to out; sign==forward selects HIPFFT_FORWARD, any
  // other value selects HIPFFT_BACKWARD. accelerator_barrier() is called
  // before the returned status is asserted.
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    const int direction = ( sign == forward ) ? HIPFFT_FORWARD : HIPFFT_BACKWARD;
    hipfftResult rv = hipfftExecC2C(p,in,out,direction);
    accelerator_barrier();
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
  }

  // Release the hipFFT plan handle.
  inline static void fftw_destroy_plan(const FFTW_plan p) {
    hipfftDestroy(p);
  }
};
#endif
#ifdef GRID_CUDA
template<> struct FFTW<ComplexD> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef cufftDoubleComplex FFTW_scalar;
typedef cufftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
else cufftExecZ2Z(p,in,out,CUFFT_INVERSE);
accelerator_barrier();
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
cufftDestroy(p);
}
};
template<> struct FFTW<ComplexF> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef cufftComplex FFTW_scalar;
typedef cufftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
else cufftExecC2C(p,in,out,CUFFT_INVERSE);
accelerator_barrier();
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
cufftDestroy(p);
}
};
#endif
#if !defined(GRID_CUDA) && !defined(GRID_HIP)
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> { template<> struct FFTW<ComplexD> {
public: public:
typedef fftw_complex FFTW_scalar; typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan; typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, FFTW_scalar *in, int *inembed,
FFTW_scalar *in, const int *inembed,
int istride, int idist, int istride, int idist,
FFTW_scalar *out, const int *onembed, FFTW_scalar *out, int *onembed,
int ostride, int odist, int ostride, int odist,
int sign, unsigned flags) { int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
} }
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
::fftw_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out); ::fftw_execute_dft(p,in,out);
} }
inline static void fftw_destroy_plan(const FFTW_plan p) { inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p); ::fftw_destroy_plan(p);
} }
}; };
template<> struct FFTW<ComplexF> { template<> struct FFTW<ComplexF> {
public: public:
typedef fftwf_complex FFTW_scalar; typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan; typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, FFTW_scalar *in, int *inembed,
FFTW_scalar *in, const int *inembed,
int istride, int idist, int istride, int idist,
FFTW_scalar *out, const int *onembed, FFTW_scalar *out, int *onembed,
int ostride, int odist, int ostride, int odist,
int sign, unsigned flags) { int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
} }
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
::fftwf_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out); ::fftwf_execute_dft(p,in,out);
} }
inline static void fftw_destroy_plan(const FFTW_plan p) { inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p); ::fftwf_destroy_plan(p);
} }
}; };
#endif #endif
#ifndef FFTW_FORWARD
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#endif #endif
class FFT { class FFT {
private: private:
GridCartesian *vgrid;
GridCartesian *sgrid;
int Nd;
double flops; double flops;
double flops_call; double flops_call;
uint64_t usec; uint64_t usec;
Coordinate dimensions;
Coordinate processors;
Coordinate processor_coor;
public: public:
static const int forward=FFTW_FORWARD; static const int forward=FFTW_FORWARD;
@@ -126,31 +237,25 @@ public:
double MFlops(void) {return flops/usec;} double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;} double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) : FFT ( GridCartesian * grid )
vgrid(grid),
Nd(grid->_ndimension),
dimensions(grid->_fdimensions),
processors(grid->_processors),
processor_coor(grid->_processor_coor)
{ {
flops=0; flops=0;
usec =0; usec =0;
Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors,*grid);
}; };
~FFT ( void) { ~FFT ( void) {
delete sgrid; // delete sgrid;
} }
template<class vobj> template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){ void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
conformable(result.Grid(),vgrid); // vgrid=result.Grid();
conformable(source.Grid(),vgrid); // conformable(result.Grid(),vgrid);
Lattice<vobj> tmp(vgrid); // conformable(source.Grid(),vgrid);
tmp = source; const int Ndim = source.Grid()->Nd();
for(int d=0;d<Nd;d++){ Lattice<vobj> tmp = source;
for(int d=0;d<Ndim;d++){
if( mask[d] ) { if( mask[d] ) {
FFT_dim(result,tmp,d,sign); FFT_dim(result,tmp,d,sign);
tmp=result; tmp=result;
@@ -160,59 +265,70 @@ public:
template<class vobj> template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){ void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
Coordinate mask(Nd,1); const int Ndim = source.Grid()->Nd();
Coordinate mask(Ndim,1);
FFT_dim_mask(result,source,mask,sign); FFT_dim_mask(result,source,mask,sign);
} }
template<class vobj> template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){ void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW const int Ndim = source.Grid()->Nd();
assert(0); GridBase *grid = source.Grid();
#else conformable(result.Grid(),source.Grid());
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
int L = vgrid->_ldimensions[dim]; int L = grid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim]; int G = grid->_fdimensions[dim];
Coordinate layout(Nd,1); Coordinate layout(Ndim,1);
Coordinate pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
// Construct pencils // Construct pencils
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar; typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
Lattice<sobj> pgbuf(&pencil_g); //std::cout << "CPU view" << std::endl;
autoView(pgbuf_v , pgbuf, CpuWrite);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar); int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1; int64_t Nlow = 1;
int64_t Nhigh = 1;
for(int d=0;d<dim;d++){ for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d]; Nlow*=grid->_ldimensions[d];
} }
for(int d=dim+1;d<Ndim;d++){
Nhigh*=grid->_ldimensions[d];
}
int64_t Nperp=Nlow*Nhigh;
deviceVector<scalar> pgbuf; // Layout is [perp][component][dim]
pgbuf.resize(Nperp*Ncomp*G);
scalar *pgbuf_v = &pgbuf[0];
int rank = 1; /* 1d transforms */ int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */ int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp; int howmany = Ncomp * Nperp;
int odist,idist,istride,ostride; int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */ idist = odist = G; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ istride = ostride = 1; /* Distance between two elements in the same FT */
int *inembed = n, *onembed = n; int *inembed = n, *onembed = n;
scalar div; scalar div;
if ( sign == backward ) div = 1.0/G; if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0; else if ( sign == forward ) div = 1.0;
else assert(0); else GRID_ASSERT(0);
double t_pencil=0;
double t_fft =0;
double t_total =-usecond();
// std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
/*
*
*/
FFTW_plan p; FFTW_plan p;
{ {
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -226,68 +342,154 @@ public:
} }
// Barrel shift and collect global pencil // Barrel shift and collect global pencil
Coordinate lcoor(Nd), gcoor(Nd); // std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Ndim), gcoor(Ndim);
double t_copy=0;
double t_shift=0;
t_pencil = -usecond();
result = source; result = source;
int pc = processor_coor[dim]; int pc = grid->_processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
{ const Coordinate ldims = grid->_ldimensions;
autoView(r_v,result,CpuRead); const Coordinate rdims = grid->_rdimensions;
autoView(p_v,pgbuf,CpuWrite); const Coordinate sdims = grid->_simd_layout;
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd); Coordinate processors = grid->_processors;
sobj s; Coordinate pgdims(Ndim);
sgrid->LocalIndexToLocalCoor(idx,cbuf); pgdims[0] = G;
peekLocalSite(s,r_v,cbuf); for(int d=0, dd=1;d<Ndim;d++){
cbuf[dim]+=((pc+p) % processors[dim])*L; if ( d!=dim ) pgdims[dd++] = ldims[d];
pokeLocalSite(s,p_v,cbuf);
});
} }
if (p != processors[dim] - 1) { int64_t pgvol=1;
result = Cshift(result,dim,L); for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
const int Nsimd = vobj::Nsimd();
for(int p=0;p<processors[dim];p++) {
t_copy-=usecond();
autoView(r_v,result,AcceleratorRead);
accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
Coordinate icoor;
Coordinate ocoor;
Coordinate pgcoor;
Lexicographic::CoorFromIndex(icoor,lane,sdims);
Lexicographic::CoorFromIndex(ocoor,idx,rdims);
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
for(int d=0,dd=1;d<Ndim;d++){
if ( d!=dim ) {
pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
dd++;
} }
} }
// Loop over orthog coords // Map coordinates in lattice layout to FFTW index
int NN=pencil_g.lSites(); int64_t pgidx;
GridStopWatch timer; Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
timer.Start();
thread_for( idx,NN,{ vector_type *from = (vector_type *)&r_v[idx];
Coordinate cbuf(Nd); scalar_type stmp;
pencil_g.LocalIndexToLocalCoor(idx, cbuf); for(int w=0;w<Ncomp;w++){
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0 int64_t pg_idx = pgidx + w*pgvol;
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx]; stmp = getlane(from[w], lane);
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx]; pgbuf_v[pg_idx] = stmp;
FFTW<scalar>::fftw_execute_dft(p,in,out);
} }
#ifdef GRID_SIMT
}
#else
}
#endif
}); });
timer.Stop();
t_copy+=usecond();
if (p != processors[dim] - 1) {
Lattice<vobj> temp(grid);
t_shift-=usecond();
temp = Cshift(result,dim,L); result = temp;
t_shift+=usecond();
}
}
t_pencil += usecond();
FFTW_scalar *in = (FFTW_scalar *)pgbuf_v;
FFTW_scalar *out= (FFTW_scalar *)pgbuf_v;
t_fft = -usecond();
FFTW<scalar>::fftw_execute_dft(p,in,out,sign);
t_fft += usecond();
// performance counting // performance counting
double add,mul,fma; flops_call = 5.0*howmany*G*log2(G);
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma); usec = t_fft;
flops_call = add+mul+2.0*fma; flops= flops_call;
usec += timer.useconds();
flops+= flops_call*NN;
// writing out result result = Zero();
double t_insert = -usecond();
{ {
autoView(pgbuf_v,pgbuf,CpuRead); autoView(r_v,result,AcceleratorWrite);
autoView(result_v,result,CpuWrite); accelerator_for(idx,grid->oSites(),Nsimd,{
thread_for(idx,sgrid->lSites(),{ #ifdef GRID_SIMT
Coordinate clbuf(Nd), cgbuf(Nd); {
sobj s; int lane=acceleratorSIMTlane(Nsimd); // buffer lane
sgrid->LocalIndexToLocalCoor(idx,clbuf); #else
cgbuf = clbuf; for(int lane=0;lane<Nsimd;lane++) {
cgbuf[dim] = clbuf[dim]+L*pc; #endif
peekLocalSite(s,pgbuf_v,cgbuf); Coordinate icoor(Ndim);
pokeLocalSite(s,result_v,clbuf); Coordinate ocoor(Ndim);
Coordinate pgcoor(Ndim);
Lexicographic::CoorFromIndex(icoor,lane,sdims);
Lexicographic::CoorFromIndex(ocoor,idx,rdims);
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
for(int d=0,dd=1;d<Ndim;d++){
if ( d!=dim ) {
pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
dd++;
}
}
// Map coordinates in lattice layout to FFTW index
int64_t pgidx;
Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
vector_type *to = (vector_type *)&r_v[idx];
scalar_type stmp;
for(int w=0;w<Ncomp;w++){
int64_t pg_idx = pgidx + w*pgvol;
stmp = pgbuf_v[pg_idx];
putlane(to[w], stmp, lane);
}
#ifdef GRID_SIMT
}
#else
}
#endif
}); });
} }
result = result*div; result = result*div;
t_insert +=usecond();
// destroying plan // destroying plan
FFTW<scalar>::fftw_destroy_plan(p); FFTW<scalar>::fftw_destroy_plan(p);
#endif
t_total +=usecond();
std::cout <<GridLogPerformance<< " FFT took "<<t_total/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " FFT pencil "<<t_pencil/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " of which copy "<<t_copy/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " of which shift"<<t_shift/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " FFT kernels "<<t_fft/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " FFT insert "<<t_insert/1.0e6 <<" s" << std::endl;
} }
}; };
+87 -23
View File
@@ -64,7 +64,7 @@ public:
// //
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm // I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases // while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to // with a GRID_ASSERT trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes // do it, but I fear it required multiple inheritance and mixed in abstract base classes
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
@@ -103,6 +103,38 @@ public:
_Mat.MdagM(in,out); _Mat.MdagM(in,out);
} }
}; };
////////////////////////////////////////////////////////////////////
// Linear operator wrapper whose HermOp applies Matrix::MMdag.
// Op / AdjOp and the OpDiag / OpDir / OpDirAll coarsening hooks forward
// straight to the corresponding Matrix members.
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class MMdagLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // wrapped matrix, held by reference
public:
  MMdagLinearOperator(Matrix &Mat): _Mat(Mat) {}

  // Plain and adjoint application
  void Op (const Field &in, Field &out) {
    _Mat.M(in,out);
  }
  void AdjOp (const Field &in, Field &out) {
    _Mat.Mdag(in,out);
  }

  // out = MMdag in
  void HermOp(const Field &in, Field &out) {
    _Mat.MMdag(in,out);
  }
  // out = MMdag in; n1 = Re <in|out>, n2 = norm2(out)
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) {
    _Mat.MMdag(in,out);
    const ComplexD inner = innerProduct(in,out);
    n1 = real(inner);
    n2 = norm2(out);
  }

  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) {
    _Mat.Mdiag(in,out);
  }
  void OpDir (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
  void OpDirAll (const Field &in, std::vector<Field> &out) {
    _Mat.MdirAll(in,out);
  }
};
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother // Construct herm op and shift it for mgrid smoother
@@ -116,22 +148,22 @@ public:
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out); _Mat.Mdiag(in,out);
assert(0); GRID_ASSERT(0);
} }
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
assert(0); GRID_ASSERT(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){ void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0); GRID_ASSERT(0);
}; };
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
assert(0); GRID_ASSERT(0);
} }
void AdjOp (const Field &in, Field &out){ void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
assert(0); GRID_ASSERT(0);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out); HermOp(in,out);
@@ -156,13 +188,13 @@ public:
ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){}; ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
assert(0); GRID_ASSERT(0);
} }
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0); GRID_ASSERT(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){ void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0); GRID_ASSERT(0);
}; };
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
HermOp(in,out); HermOp(in,out);
@@ -239,10 +271,42 @@ public:
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0); GRID_ASSERT(0);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
assert(0); GRID_ASSERT(0);
}
};
// Linear operator wrapper that adds a real multiple of the input to the
// wrapped Matrix: Op, AdjOp and OpDiag return (Matrix result) + shift*in.
// OpDir and OpDirAll forward to the Matrix without any shift term.
// HermOp and HermOpAndNorm are not supported and trap via GRID_ASSERT(0).
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
// Wrapped matrix, held by reference.
Matrix &_Mat;
// Real shift applied in OpDiag, Op and AdjOp.
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
// the shift is added to the diagonal term only
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
// out = M in + shift * in
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
// out = Mdag in + shift * in (shift is a RealD, so it is used unconjugated)
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
// Hermitian entry points are traps for this non-Hermitian wrapper.
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
GRID_ASSERT(0);
}
void HermOp(const Field &in, Field &out){
GRID_ASSERT(0);
} }
}; };
@@ -281,13 +345,13 @@ class SchurOperatorBase : public LinearOperatorBase<Field> {
} }
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
assert(0); // must coarsen the unpreconditioned system GRID_ASSERT(0); // must coarsen the unpreconditioned system
} }
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0); GRID_ASSERT(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){ void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0); GRID_ASSERT(0);
}; };
}; };
template<class Matrix,class Field> template<class Matrix,class Field>
@@ -383,10 +447,10 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
MpcDag(tmp,out); MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) { virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
assert(0); GRID_ASSERT(0);
} }
virtual void HermOp(const Field& in, Field& out) { virtual void HermOp(const Field& in, Field& out) {
assert(0); GRID_ASSERT(0);
} }
void Op(const Field& in, Field& out) { void Op(const Field& in, Field& out) {
Mpc(in, out); Mpc(in, out);
@@ -396,13 +460,13 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
} }
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag(const Field& in, Field& out) { void OpDiag(const Field& in, Field& out) {
assert(0); // must coarsen the unpreconditioned system GRID_ASSERT(0); // must coarsen the unpreconditioned system
} }
void OpDir(const Field& in, Field& out, int dir, int disp) { void OpDir(const Field& in, Field& out, int dir, int disp) {
assert(0); GRID_ASSERT(0);
} }
void OpDirAll(const Field& in, std::vector<Field>& out){ void OpDirAll(const Field& in, std::vector<Field>& out){
assert(0); GRID_ASSERT(0);
}; };
}; };
@@ -516,7 +580,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
public: public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{ {
assert( _Mat.isTrivialEE() ); GRID_ASSERT( _Mat.isTrivialEE() );
mass = _Mat.Mass(); mass = _Mat.Mass();
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
@@ -547,7 +611,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
Mpc(in,out); Mpc(in,out);
} }
virtual void MpcDagMpc(const Field &in, Field &out) { virtual void MpcDagMpc(const Field &in, Field &out) {
assert(0);// Never need with staggered GRID_ASSERT(0);// Never need with staggered
} }
}; };
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>; template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
@@ -559,7 +623,7 @@ template<class Field> class OperatorFunction {
public: public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0; virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) { virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
assert(in.size()==out.size()); GRID_ASSERT(in.size()==out.size());
for(int k=0;k<in.size();k++){ for(int k=0;k<in.size();k++){
(*this)(Linop,in[k],out[k]); (*this)(Linop,in[k],out[k]);
} }
@@ -573,7 +637,7 @@ public:
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out) virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{ {
assert(in.size() == out.size()); GRID_ASSERT(in.size() == out.size());
for (unsigned int i = 0; i < in.size(); ++i) for (unsigned int i = 0; i < in.size(); ++i)
{ {
+5
View File
@@ -45,6 +45,11 @@ public:
M(in,tmp); M(in,tmp);
Mdag(tmp,out); Mdag(tmp,out);
} }
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0; virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
+23 -1
View File
@@ -59,7 +59,7 @@ public:
RealD diff = hi-lo; RealD diff = hi-lo;
RealD delta = diff*1.0e-9; RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) { for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1; delta*=1.02;
RealD f = approx(x); RealD f = approx(x);
out<< x<<" "<<f<<std::endl; out<< x<<" "<<f<<std::endl;
} }
@@ -131,6 +131,26 @@ public:
Coeffs[j] = s * 2.0/order; Coeffs[j] = s * 2.0/order;
} }
}; };
// Initialise the approximation from an arbitrary callable.
//   _lo, _hi : interval the approximation is built on (stored in lo / hi)
//   _order   : number of coefficients; the process exits with -1 if < 2
//   func     : callable evaluated as func(RealD) at each sampling node
// Fills Coeffs[j] = (2/order) * sum_k func(x_k) * cos( j*pi*(k+0.5)/order ),
// with x_k = 0.5*( cos(pi*(k+0.5)/order)*(hi-lo) + (hi+lo) ).
// The node loop is outermost so func is called once per node (order calls)
// instead of once per (j,k) pair; each coefficient still accumulates its
// terms in ascending k.
// NOTE(review): accumulates directly in Coeffs[j], so this assumes Coeffs
// stores RealD values - confirm against the member declaration.
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
  lo=_lo;
  hi=_hi;
  order=_order;

  if(order < 2) exit(-1);
  Coeffs.resize(order);
  // resize() keeps any pre-existing entries, so clear before accumulating
  for(int j=0;j<order;j++){
    Coeffs[j] = 0.0;
  }
  for(int k=0;k<order;k++){
    RealD y=std::cos(M_PI*(k+0.5)/order);
    RealD x=0.5*(y*(hi-lo)+(hi+lo));
    RealD f=func(x);              // sampled once per node
    for(int j=0;j<order;j++){
      Coeffs[j] = Coeffs[j] + f*std::cos( j*M_PI*(k+0.5)/order );
    }
  }
  for(int j=0;j<order;j++){
    Coeffs[j] = Coeffs[j] * 2.0/order;
  }
};
void JacksonSmooth(void){ void JacksonSmooth(void){
@@ -249,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo); RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo); RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y); Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in); axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1 // sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1; // out = ()*T0 + Coeffs[1]*T1;
+3 -3
View File
@@ -121,7 +121,7 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
// Reallocate arrays, since degree has changed // Reallocate arrays, since degree has changed
if (num_degree != n || den_degree != d) allocate(num_degree,den_degree); if (num_degree != n || den_degree != d) allocate(num_degree,den_degree);
assert(a_len<=SUM_MAX); GRID_ASSERT(a_len<=SUM_MAX);
step = new bigfloat[num_degree+den_degree+2]; step = new bigfloat[num_degree+den_degree+2];
@@ -151,9 +151,9 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
equations(); equations();
if (delta < tolerance) { if (delta < tolerance) {
std::cout<<"Delta too small, try increasing precision\n"; std::cout<<"Delta too small, try increasing precision\n";
assert(0); GRID_ASSERT(0);
}; };
assert( delta>= tolerance); GRID_ASSERT( delta>= tolerance);
search(step); search(step);
} }
+1 -1
View File
@@ -134,7 +134,7 @@ class AlgRemez
virtual ~AlgRemez(); virtual ~AlgRemez();
int getDegree(void){ int getDegree(void){
assert(n==d); GRID_ASSERT(n==d);
return n; return n;
} }
// Reset the bounds of the approximation // Reset the bounds of the approximation
+8 -8
View File
@@ -28,11 +28,11 @@ void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyTy
pow_n = num_degree; pow_n = num_degree;
pow_d = den_degree; pow_d = den_degree;
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0); if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) GRID_ASSERT(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0); if(pow_n % 2 == 1 && num_type_in == PolyType::Even) GRID_ASSERT(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0); if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) GRID_ASSERT(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0); if(pow_d % 2 == 1 && den_type_in == PolyType::Even) GRID_ASSERT(0);
num_type = num_type_in; num_type = num_type_in;
den_type = den_type_in; den_type = den_type_in;
@@ -112,9 +112,9 @@ double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degre
equations(); equations();
if (delta < tolerance) { if (delta < tolerance) {
std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n"; std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
assert(0); GRID_ASSERT(0);
}; };
assert( delta>= tolerance ); GRID_ASSERT( delta>= tolerance );
search(); search();
} }
@@ -278,7 +278,7 @@ void AlgRemezGeneral::equations(){
if(num_pows[j] != -1){ *aa++ = z; t++; } if(num_pows[j] != -1){ *aa++ = z; t++; }
z *= x; z *= x;
} }
assert(t == n+1); GRID_ASSERT(t == n+1);
z = (bigfloat)1l; z = (bigfloat)1l;
t = 0; t = 0;
@@ -286,7 +286,7 @@ void AlgRemezGeneral::equations(){
if(den_pows[j] != -1){ *aa++ = -y * z; t++; } if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
z *= x; z *= x;
} }
assert(t == d); GRID_ASSERT(t == d);
B[i] = y * z; // Right hand side vector B[i] = y * z; // Right hand side vector
} }
+1 -1
View File
@@ -106,7 +106,7 @@ class AlgRemezGeneral{
bigfloat (*f)(bigfloat x, void *data), void *data); bigfloat (*f)(bigfloat x, void *data), void *data);
inline int getDegree(void) const{ inline int getDegree(void) const{
assert(n==d); GRID_ASSERT(n==d);
return n; return n;
} }
// Reset the bounds of the approximation // Reset the bounds of the approximation
+1 -1
View File
@@ -74,7 +74,7 @@ bigfloat epsilonMobius(bigfloat x, void* data){
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in, const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound){ const RealD lambda_bound){
assert(omega_in.size() == Ls_in); GRID_ASSERT(omega_in.size() == Ls_in);
omega_out.resize(Ls_out); omega_out.resize(Ls_out);
//Use the Remez algorithm to generate the appropriate rational polynomial //Use the Remez algorithm to generate the appropriate rational polynomial
+572 -35
View File
@@ -28,6 +28,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
#pragma once #pragma once
#ifdef GRID_HIP #ifdef GRID_HIP
#include <hip/hip_version.h>
#include <hipblas/hipblas.h> #include <hipblas/hipblas.h>
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
@@ -55,16 +56,17 @@ NAMESPACE_BEGIN(Grid);
typedef cublasHandle_t gridblasHandle_t; typedef cublasHandle_t gridblasHandle_t;
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
typedef cl::sycl::queue *gridblasHandle_t; typedef sycl::queue *gridblasHandle_t;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
typedef cl::sycl::queue *gridblasHandle_t; typedef sycl::queue *gridblasHandle_t;
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t; typedef int32_t gridblasHandle_t;
#endif #endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ; enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
class GridBLAS { class GridBLAS {
public: public:
@@ -89,15 +91,30 @@ public:
gridblasHandle = theGridAccelerator; gridblasHandle = theGridAccelerator;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
cl::sycl::gpu_selector selector; sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector }; sycl::device selectedDevice { selector };
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; sycl::property_list q_prop{sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop); gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif #endif
gridblasInit=1; gridblasInit=1;
} }
} }
#ifdef GRID_CUDA
  // Translate a reduced-precision request into the corresponding cuBLAS compute type.
  // Only the 16F, 16BF and TF32 requests are convertible; any other value
  // (including GridBLAS_PRECISION_DEFAULT) trips GRID_ASSERT, and the 16F
  // compute type is the fallback return should execution continue past it.
  cublasComputeType_t toDataType(GridBLASPrecision_t p) {
    if (p == GridBLAS_PRECISION_16F)  return CUBLAS_COMPUTE_32F_FAST_16F;
    if (p == GridBLAS_PRECISION_16BF) return CUBLAS_COMPUTE_32F_FAST_16BF;
    if (p == GridBLAS_PRECISION_TF32) return CUBLAS_COMPUTE_32F_FAST_TF32;
    GRID_ASSERT(0);
    return CUBLAS_COMPUTE_32F_FAST_16F;
  }
#endif
// Force construct once // Force construct once
GridBLAS() { Init(); }; GridBLAS() { Init(); };
~GridBLAS() { }; ~GridBLAS() { };
@@ -119,11 +136,11 @@ public:
{ {
#ifdef GRID_HIP #ifdef GRID_HIP
auto err = hipDeviceSynchronize(); auto err = hipDeviceSynchronize();
assert(err==hipSuccess); GRID_ASSERT(err==hipSuccess);
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
auto err = cudaDeviceSynchronize(); auto err = cudaDeviceSynchronize();
assert(err==cudaSuccess); GRID_ASSERT(err==cudaSuccess);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
accelerator_barrier(); accelerator_barrier();
@@ -138,8 +155,10 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn, deviceVector<ComplexD*> &Bkn,
ComplexD beta, ComplexD beta,
deviceVector<ComplexD*> &Cmn) deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{ {
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k, m,n,k,
alpha, alpha,
@@ -201,15 +220,17 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn, deviceVector<ComplexD*> &Bkn,
ComplexD beta, ComplexD beta,
deviceVector<ComplexD*> &Cmn) deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{ {
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount); GRID_ASSERT(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount); GRID_ASSERT(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T); //assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
@@ -235,6 +256,18 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipDoubleComplex *) &alpha_p[0],
(hipDoubleComplex **)&Amk[0], lda,
(hipDoubleComplex **)&Bkn[0], ldb,
(hipDoubleComplex *) &beta_p[0],
(hipDoubleComplex **)&Cmn[0], ldc,
batchCount);
#else
auto err = hipblasZgemmBatched(gridblasHandle, auto err = hipblasZgemmBatched(gridblasHandle,
hOpA, hOpA,
hOpB, hOpB,
@@ -245,8 +278,9 @@ public:
(hipblasDoubleComplex *) &beta_p[0], (hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc, (hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount); batchCount);
#endif
// std::cout << " hipblas return code " <<(int)err<<std::endl; // std::cout << " hipblas return code " <<(int)err<<std::endl;
assert(err==HIPBLAS_STATUS_SUCCESS); GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
cublasOperation_t hOpA; cublasOperation_t hOpA;
@@ -267,7 +301,7 @@ public:
(cuDoubleComplex *) &beta_p[0], (cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc, (cuDoubleComplex **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==CUBLAS_STATUS_SUCCESS); GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
int64_t m64=m; int64_t m64=m;
@@ -367,28 +401,67 @@ public:
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ; eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
}); });
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} ); } );
} else { } else {
assert(0); assert(0);
@@ -409,13 +482,14 @@ public:
deviceVector<ComplexF*> &Amk, // pointer list to matrices deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn, deviceVector<ComplexF*> &Bkn,
ComplexF beta, ComplexF beta,
deviceVector<ComplexF*> &Cmn) deviceVector<ComplexF*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{ {
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T); //assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
@@ -431,9 +505,10 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF)); acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond(); RealD t0=usecond();
assert(Bkn.size()==batchCount); GRID_ASSERT(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount); GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP #ifdef GRID_HIP
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
hipblasOperation_t hOpA; hipblasOperation_t hOpA;
hipblasOperation_t hOpB; hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -442,6 +517,18 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipComplex *) &alpha_p[0],
(hipComplex **)&Amk[0], lda,
(hipComplex **)&Bkn[0], ldb,
(hipComplex *) &beta_p[0],
(hipComplex **)&Cmn[0], ldc,
batchCount);
#else
auto err = hipblasCgemmBatched(gridblasHandle, auto err = hipblasCgemmBatched(gridblasHandle,
hOpA, hOpA,
hOpB, hOpB,
@@ -453,7 +540,8 @@ public:
(hipblasComplex **)&Cmn[0], ldc, (hipblasComplex **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS); #endif
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
cublasOperation_t hOpA; cublasOperation_t hOpA;
@@ -464,7 +552,9 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemmBatched(gridblasHandle, cublasStatus_t err;
if (precision == GridBLAS_PRECISION_DEFAULT) {
err = cublasCgemmBatched(gridblasHandle,
hOpA, hOpA,
hOpB, hOpB,
m,n,k, m,n,k,
@@ -474,9 +564,23 @@ public:
(cuComplex *) &beta_p[0], (cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc, (cuComplex **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==CUBLAS_STATUS_SUCCESS); } else {
cublasComputeType_t compute_precision = toDataType(precision);
err = cublasGemmBatchedEx(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(void *) &alpha_p[0],
(void **)&Amk[0], CUDA_C_32F, lda,
(void **)&Bkn[0], CUDA_C_32F, ldb,
(void *) &beta_p[0],
(void **)&Cmn[0], CUDA_C_32F, ldc,
batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
}
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
int64_t m64=m; int64_t m64=m;
int64_t n64=n; int64_t n64=n;
int64_t k64=k; int64_t k64=k;
@@ -508,34 +612,77 @@ public:
synchronise(); synchronise();
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
// Need a default/reference implementation; use Eigen // Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) { if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ; eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
}); });
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} ); } );
} else { } else {
assert(0); assert(0);
@@ -562,8 +709,8 @@ public:
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate GRID_ASSERT(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C); GRID_ASSERT(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
@@ -579,8 +726,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF)); acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond(); RealD t0=usecond();
assert(Bkn.size()==batchCount); GRID_ASSERT(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount); GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP #ifdef GRID_HIP
hipblasOperation_t hOpA; hipblasOperation_t hOpA;
hipblasOperation_t hOpB; hipblasOperation_t hOpB;
@@ -600,7 +747,7 @@ public:
(float *) &beta_p[0], (float *) &beta_p[0],
(float **)&Cmn[0], ldc, (float **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS); GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
cublasOperation_t hOpA; cublasOperation_t hOpA;
@@ -621,7 +768,7 @@ public:
(float *) &beta_p[0], (float *) &beta_p[0],
(float **)&Cmn[0], ldc, (float **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==CUBLAS_STATUS_SUCCESS); GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
int64_t m64=m; int64_t m64=m;
@@ -661,28 +808,40 @@ public:
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ; eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
}); });
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
}); });
} else { } else {
assert(0); assert(0);
@@ -709,8 +868,8 @@ public:
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate GRID_ASSERT(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C); GRID_ASSERT(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
@@ -727,8 +886,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD)); acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond(); RealD t0=usecond();
assert(Bkn.size()==batchCount); GRID_ASSERT(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount); GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP #ifdef GRID_HIP
hipblasOperation_t hOpA; hipblasOperation_t hOpA;
hipblasOperation_t hOpB; hipblasOperation_t hOpB;
@@ -748,7 +907,7 @@ public:
(double *) &beta_p[0], (double *) &beta_p[0],
(double **)&Cmn[0], ldc, (double **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS); GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
cublasOperation_t hOpA; cublasOperation_t hOpA;
@@ -769,7 +928,7 @@ public:
(double *) &beta_p[0], (double *) &beta_p[0],
(double **)&Cmn[0], ldc, (double **)&Cmn[0], ldc,
batchCount); batchCount);
assert(err==CUBLAS_STATUS_SUCCESS); GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
int64_t m64=m; int64_t m64=m;
@@ -809,28 +968,40 @@ public:
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ; eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
}); });
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
}); });
} else { } else {
assert(0); assert(0);
@@ -841,6 +1012,372 @@ public:
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
} }
/*
Inverse and Determinant
- CPU version uses Eigen
- GPU version uses LAPACK-compatible getrf / getri
Design comment: Eigen does not expose getrf / getri in a LAPACK-compatible manner,
and the overhead of going through getrf / getri would be too large for the CPU version.
The current interface therefore guarantees only the inverse and determinant
functions on all platforms, not the getrf / getri ones.
*/
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
void inverseBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &Cnn) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void inverseBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &Cnn) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &C) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &C) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
#else
#ifdef GRID_SYCL
// SYCL backend for getrfBatched: runs oneMKL getrf_batch on every n x n matrix
// pointed to by Ann, using a single group of batchCount matrices.
//   n    : matrix dimension, also passed as the leading dimension
//   Ann  : list of matrix pointers handed to getrf_batch
//   ipiv : flat pivot storage; matrix i uses the n entries starting at ipiv[i*n]
//   info : accepted for signature parity with the other backends but never
//          read or written here
// NOTE(review): ipiv.size() is not checked in this routine; the visible caller
// asserts ipiv.size()==batchCount*n before dispatching here.
template<typename T>
void getrfBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info) {
int64_t batchCount = Ann.size();
// Function-local static scratch space: kept between calls and only ever grown.
// NOTE(review): being static, it is shared by all callers of this instantiation
// - confirm this is never entered concurrently.
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
// getrf_batch wants one pivot pointer per matrix; build that pointer table
// (again in a grow-only static buffer) from the flat ipiv array.
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
// Arguments: m = n = lda = n, one group of batchCount matrices, no dependent events.
oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
&n, &n,
(T **)&Ann[0],
&n,
(int64_t**)&_ipiv[0],
(int64_t)1, &batchCount,
(T*)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
// The returned event is discarded; synchronise() is called before returning.
synchronise();
}
#endif
// Batched LU factorisation, double precision complex.
//   n    : dimension of each square matrix (also used as leading dimension)
//   Ann  : one matrix pointer per batch entry, handed to the vendor getrf
//   ipiv : pivot storage, must hold batchCount*n entries
//   info : status storage, must hold batchCount entries
// Dispatches to hipBLAS, cuBLAS or the oneMKL helper depending on the backend.
// For HIP / CUDA the int64_t pivot and info buffers are passed as int arrays.
void getrfBatched(int64_t n,
                  deviceVector<ComplexD*> &Ann,
                  deviceVector<int64_t> &ipiv,
                  deviceVector<int64_t> &info)
{
  const int64_t batchCount = Ann.size();
  GRID_ASSERT(ipiv.size()==batchCount*n);
  GRID_ASSERT(info.size()==batchCount);
#ifdef GRID_HIP
  // The name of the hipBLAS complex type depends on the HIP major version.
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
  typedef hipDoubleComplex hip_complex_t;
#else
  typedef hipblasDoubleComplex hip_complex_t;
#endif
  auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
                                  (hip_complex_t **)&Ann[0], (int)n,
                                  (int*) &ipiv[0],
                                  (int*) &info[0],
                                  (int)batchCount);
  GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
  cuDoubleComplex **matrices = (cuDoubleComplex **)&Ann[0];
  int *pivots   = (int*) &ipiv[0];
  int *statuses = (int*) &info[0];
  auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
                                 matrices, (int)n,
                                 pivots,
                                 statuses,
                                 (int)batchCount);
  GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
  getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
// Batched LU factorisation, single precision complex.
//   n    : dimension of each square matrix (also used as leading dimension)
//   Ann  : one matrix pointer per batch entry, handed to the vendor getrf
//   ipiv : pivot storage, must hold batchCount*n entries
//   info : status storage, must hold batchCount entries
// Dispatches to hipBLAS, cuBLAS or the oneMKL helper depending on the backend.
// For HIP / CUDA the int64_t pivot and info buffers are passed as int arrays.
void getrfBatched(int64_t n,
                  deviceVector<ComplexF*> &Ann,
                  deviceVector<int64_t> &ipiv,
                  deviceVector<int64_t> &info)
{
  const int64_t batchCount = Ann.size();
  GRID_ASSERT(ipiv.size()==batchCount*n);
  GRID_ASSERT(info.size()==batchCount);
#ifdef GRID_HIP
  // The name of the hipBLAS complex type depends on the HIP major version.
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
  typedef hipComplex hip_complex_t;
#else
  typedef hipblasComplex hip_complex_t;
#endif
  auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
                                  (hip_complex_t **)&Ann[0], (int)n,
                                  (int*) &ipiv[0],
                                  (int*) &info[0],
                                  (int)batchCount);
  GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
  cuComplex **matrices = (cuComplex **)&Ann[0];
  int *pivots   = (int*) &ipiv[0];
  int *statuses = (int*) &info[0];
  auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
                                 matrices, (int)n,
                                 pivots,
                                 statuses,
                                 (int)batchCount);
  GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
  getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
#ifdef GRID_SYCL
template<typename T>
void getriBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<T*> &Cnn) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getri_batch(*gridblasHandle,
&n,
(T **)&Ann[0],
&n,
(int64_t**)p_ipiv,
(int64_t)1, &batchCount,
(T *)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
T** pA = &Ann[0];
T** pC = &Cnn[0];
accelerator_for(i, batchCount*n*n, 1, {
auto j = i / batchCount;
auto k = i % batchCount;
pC[k][j] = pA[k][j];
});
}
#endif
// Batched inverse from an LU factorisation, double precision complex.
//   n    : dimension of each square matrix (also used as leading dimension)
//   Ann  : one matrix pointer per batch entry (input to the vendor getri)
//   ipiv : pivot storage, must hold batchCount*n entries
//   info : status storage, must hold batchCount entries
//   Cnn  : one output matrix pointer per batch entry
// Dispatches to hipBLAS, cuBLAS or the oneMKL helper depending on the backend.
// For HIP / CUDA the int64_t pivot and info buffers are passed as int arrays.
void getriBatched(int64_t n,
                  deviceVector<ComplexD*> &Ann,
                  deviceVector<int64_t> &ipiv,
                  deviceVector<int64_t> &info,
                  deviceVector<ComplexD*> &Cnn)
{
  const int64_t batchCount = Ann.size();
  GRID_ASSERT(ipiv.size()==batchCount*n);
  GRID_ASSERT(info.size()==batchCount);
  GRID_ASSERT(Cnn.size()==batchCount);
#ifdef GRID_HIP
  // The name of the hipBLAS complex type depends on the HIP major version.
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
  typedef hipDoubleComplex hip_complex_t;
#else
  typedef hipblasDoubleComplex hip_complex_t;
#endif
  auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
                                  (hip_complex_t **)&Ann[0], (int)n,
                                  (int*) &ipiv[0],
                                  (hip_complex_t **)&Cnn[0], (int)n,
                                  (int*) &info[0],
                                  (int)batchCount);
  GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
  cuDoubleComplex **factors  = (cuDoubleComplex **)&Ann[0];
  cuDoubleComplex **inverses = (cuDoubleComplex **)&Cnn[0];
  int *pivots   = (int*) &ipiv[0];
  int *statuses = (int*) &info[0];
  auto err = cublasZgetriBatched(gridblasHandle, (int)n,
                                 factors, (int)n,
                                 pivots,
                                 inverses, (int)n,
                                 statuses,
                                 (int)batchCount);
  GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
  getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
// Batched inverse from an LU factorisation, single precision complex.
//   n    : dimension of each square matrix (also used as leading dimension)
//   Ann  : one matrix pointer per batch entry (input to the vendor getri)
//   ipiv : pivot storage, must hold batchCount*n entries
//   info : status storage, must hold batchCount entries
//   Cnn  : one output matrix pointer per batch entry
// Dispatches to hipBLAS, cuBLAS or the oneMKL helper depending on the backend.
// For HIP / CUDA the int64_t pivot and info buffers are passed as int arrays.
void getriBatched(int64_t n,
                  deviceVector<ComplexF*> &Ann,
                  deviceVector<int64_t> &ipiv,
                  deviceVector<int64_t> &info,
                  deviceVector<ComplexF*> &Cnn)
{
  const int64_t batchCount = Ann.size();
  GRID_ASSERT(ipiv.size()==batchCount*n);
  GRID_ASSERT(info.size()==batchCount);
  GRID_ASSERT(Cnn.size()==batchCount);
#ifdef GRID_HIP
  // The name of the hipBLAS complex type depends on the HIP major version.
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
  typedef hipComplex hip_complex_t;
#else
  typedef hipblasComplex hip_complex_t;
#endif
  auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
                                  (hip_complex_t **)&Ann[0], (int)n,
                                  (int*) &ipiv[0],
                                  (hip_complex_t **)&Cnn[0], (int)n,
                                  (int*) &info[0],
                                  (int)batchCount);
  GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
  cuComplex **factors  = (cuComplex **)&Ann[0];
  cuComplex **inverses = (cuComplex **)&Cnn[0];
  int *pivots   = (int*) &ipiv[0];
  int *statuses = (int*) &info[0];
  auto err = cublasCgetriBatched(gridblasHandle, (int)n,
                                 factors, (int)n,
                                 pivots,
                                 inverses, (int)n,
                                 statuses,
                                 (int)batchCount);
  GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
  getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
// Accelerator batched inverse: LU-factorise every matrix with getrfBatched,
// then form the inverses with getriBatched.
//   n   : dimension of each square matrix
//   Ann : one matrix pointer per batch entry; contents are overwritten by the LU step
//   Cnn : one matrix pointer per batch entry; receives the inverse
// Temporary pivot (n per matrix) and info (one per matrix) buffers are allocated per call.
// The info values are not inspected here, so singular inputs are not reported.
template<typename dtype>
void inverseBatched(int64_t n,
                    deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
                    deviceVector<dtype*> &Cnn  // this will be overwritten with the inverse
                    ) {
  int64_t batchCount = Ann.size();
  // Timing probes are kept commented out, matching determinantBatched;
  // previously t0 was taken but never used.
  //RealD t0 = usecond();
  deviceVector<int64_t> ipiv(batchCount*n);
  deviceVector<int64_t> info(batchCount);

  //RealD t1 = usecond();
  getrfBatched(n, Ann, ipiv, info);
  // test info for non-invertibility?  set to nan if yes?
  getriBatched(n, Ann, ipiv, info, Cnn);
  //synchronise();
  //RealD t2 = usecond();
  //std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
}
template<typename dtype>
void determinantBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &C // this will be overwritten with determinant
) {
int64_t batchCount = Ann.size();
//RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
dtype** pAnn = (dtype**)&Ann[0];
dtype** pC = (dtype**)&C[0];
#if defined(GRID_CUDA) || defined(GRID_HIP)
int* pipiv = (int*)&ipiv[0];
#else
int64_t* pipiv = (int64_t*)&ipiv[0];
#endif
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
//RealD t2 = usecond();
accelerator_for(i,batchCount,1,{
dtype det = 1.0;
for (int64_t j=0;j<n;j++) {
det *= pAnn[i][n*j + j];
// branchless signs
det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
}
*pC[i] = det;
});
//RealD t3 = usecond();
//std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
}
#endif
template<class CComplex> template<class CComplex>
double benchmark(int M, int N, int K, int BATCH) double benchmark(int M, int N, int K, int BATCH)
{ {
+300
View File
@@ -0,0 +1,300 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MomentumProject.h
Copyright (C) 2025
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
MomentumProject
Import vectors -> nxyz x (ncomponent x nt)
Import complex phases -> nmom x nxyz
apply = via (possibly batched) GEMM
*/
template<class Field, class ComplexField>
class MomentumProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
GridBase *grid;
uint64_t nmom;
uint64_t nxyz;
uint64_t nt;
uint64_t nbtw;
uint64_t words;
deviceVector<scalar> BLAS_V; //
deviceVector<scalar> BLAS_M; //
deviceVector<scalar> BLAS_P; //
MomentumProject(){};
~MomentumProject(){ Deallocate(); };
void Deallocate(void)
{
grid=nullptr;
nmom=0;
nxyz=0;
nt=0;
nbtw=0;
words=0;
BLAS_V.resize(0);
BLAS_M.resize(0);
BLAS_P.resize(0);
}
void Allocate(int _nmom,GridBase *_grid)
{
grid=_grid;
Coordinate ldims = grid->LocalDimensions();
nmom=_nmom;
nt = ldims[grid->Nd()-1];
nxyz = grid->lSites()/nt;
words = sizeof(scalar_object)/sizeof(scalar);
nbtw = nt * words;
BLAS_V.resize (nxyz * nt * words );
BLAS_M.resize (nmom * nxyz );
BLAS_P.resize (nmom * nt * words );
}
void ImportMomenta(const std::vector <ComplexField> &momenta)
{
GRID_ASSERT(momenta.size()==nmom);
// might as well just make the momenta here
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_M.size();
GRID_ASSERT(momenta.size()==nmom)
GRID_ASSERT(momenta[0].Grid()==grid);
GRID_ASSERT(sz = nxyz * nmom);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims = grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords = words; // local variable for copy in to GPU
int64_t Nxyz = nxyz;
auto blasData_p = &BLAS_M[0];
for(int m=0;m<momenta.size();m++){
autoView( Data , momenta[m], AcceleratorRead);
auto Data_p = &Data[0];
accelerator_for(xyz,nxyz,1,{
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Lexicographic::CoorFromIndex(lcoor,xyz,ldims);
Coordinate icoor(nd);
Coordinate ocoor(nd);
for (int d = 0; d < nd; d++) {
icoor[d] = lcoor[d]/rdimensions[d];
ocoor[d] = lcoor[d]%rdimensions[d];
}
int64_t osite;
int64_t isite;
Lexicographic::IndexFromCoor(ocoor,osite,rdimensions);
Lexicographic::IndexFromCoor(icoor,isite,simd);
// BLAS_M[nmom][slice_vol]
// Fortran Column major BLAS layout is M_xyz,mom
scalar data = extractLane(isite,Data[osite]);
uint64_t idx = xyz+m*Nxyz;
blasData_p[idx] = data;
});
}
}
void ImportVector(Field &vec)
{
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_V.size();
GRID_ASSERT(sz = nxyz * words * nt);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims= grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords= words; // local variable for copy in to GPU
auto blasData_p = &BLAS_V[0];
autoView( Data , vec, AcceleratorRead);
auto Data_p = &Data[0];
int64_t nwords = words;// for capture
int64_t Nt = nt;// for capture
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Coordinate icoor(nd);
Coordinate ocoor(nd);
Lexicographic::CoorFromIndex(icoor,lane,simd);
Lexicographic::CoorFromIndex(ocoor,sf,rdimensions);
int64_t l_xyz = 0;
for (int d = 0; d < nd; d++) {
lcoor[d] = rdimensions[d]*icoor[d] + ocoor[d];
}
uint64_t l_t = lcoor[nd-1];
Coordinate xyz_coor = lcoor;
xyz_coor[nd-1] =0;
Lexicographic::IndexFromCoor(xyz_coor,l_xyz,ldims);
scalar_object data = extractLane(lane,Data[sf]);
scalar *data_words = (scalar *) &data;
for(int w = 0 ; w < nwords; w++) {
// BLAS_V[slice_vol][nt][words]
// Fortran Column major BLAS layout is V_(t,w)_xyz
uint64_t idx = w+l_t*nwords + l_xyz * nwords * Nt;
blasData_p[idx] = data_words[w];
}
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
void ExportMomentumProjection(std::vector<typename Field::scalar_object> &projection)
{
projection.resize(nmom*nt);
acceleratorCopyFromDevice(&BLAS_P[0],(scalar *)&projection[0],BLAS_P.size()*sizeof(scalar));
// Could decide on a layout late?
}
// Row major layout "C" order:
// BLAS_V[slice_vol][nt][words]
// BLAS_M[nmom][slice_vol]
// BLAS_P[nmom][nt][words]
//
// Fortran Column major BLAS layout is V_(w,t)_xyz
// Fortran Column major BLAS layout is M_xyz,mom
// Fortran Column major BLAS layout is P_(w,t),mom
//
// Projected
//
// P = (V * M)_(w,t),mom
//
void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata)
{
double t_import=0;
double t_export=0;
double t_gemm =0;
double t_allreduce=0;
t_import-=usecond();
this->ImportVector(data);
std::vector< typename Field::scalar_object > projected_planes;
deviceVector<scalar *> Vd(1);
deviceVector<scalar *> Md(1);
deviceVector<scalar *> Pd(1);
scalar * Vh = & BLAS_V[0];
scalar * Mh = & BLAS_M[0];
scalar * Ph = & BLAS_P[0];
acceleratorPut(Vd[0],Vh);
acceleratorPut(Md[0],Mh);
acceleratorPut(Pd[0],Ph);
t_import+=usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// P_im = VMmx . Vxi
/////////////////////////////////////////
t_gemm-=usecond();
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
words*nt,nmom,nxyz,
scalar(1.0),
Vd,
Md,
scalar(0.0), // wipe out result
Pd);
BLAS.synchronise();
t_gemm+=usecond();
t_export-=usecond();
ExportMomentumProjection(projected_planes); // resizes
t_export+=usecond();
/////////////////////////////////
// Reduce across MPI ranks
/////////////////////////////////
int nd = grid->Nd();
int gt = grid->GlobalDimensions()[nd-1];
int lt = grid->LocalDimensions()[nd-1];
projected_gdata.resize(gt*nmom);
for(int t=0;t<gt*nmom;t++){ // global Nt array with zeroes for stuff not on this node
projected_gdata[t]=Zero();
}
for(int t=0;t<lt;t++){
for(int m=0;m<nmom;m++){
int st = grid->LocalStarts()[nd-1];
projected_gdata[t+st + gt*m] = projected_planes[t+lt*m];
}}
t_allreduce-=usecond();
grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words);
t_allreduce+=usecond();
std::cout << GridLogPerformance<<" MomentumProject t_import "<<t_import<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_export "<<t_export<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_gemm "<<t_gemm<<"us"<<std::endl;
std::cout << GridLogPerformance<<" MomentumProject t_reduce "<<t_allreduce<<"us"<<std::endl;
}
};
NAMESPACE_END(Grid);
+3 -4
View File
@@ -69,8 +69,8 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N) DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N) : evec(_evec), eval(_eval), N(_N)
{ {
assert(evec.size()==eval.size()); GRID_ASSERT(evec.size()==eval.size());
assert(N <= evec.size()); GRID_ASSERT(N <= evec.size());
} }
virtual void operator()(const Field &src,Field &guess) { virtual void operator()(const Field &src,Field &guess) {
@@ -141,8 +141,7 @@ public:
} }
//postprocessing //postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl; std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++) for (int j=0;j<Nsrc;j++) {
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl; std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace); blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard(); guess[j].Checkerboard() = src[j].Checkerboard();
@@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
GRID_ASSERT(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
GRID_ASSERT(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
GRID_ASSERT(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);
@@ -131,12 +131,12 @@ public:
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl; // std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
assert(vecs[0].Grid()==fine_grid); GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension; int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites()); GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension); Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
@@ -164,7 +164,7 @@ public:
const int Nsimd = vobj::Nsimd(); const int Nsimd = vobj::Nsimd();
// std::cout << "sz "<<sz<<std::endl; // std::cout << "sz "<<sz<<std::endl;
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl; // std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
assert(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words); GRID_ASSERT(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
uint64_t lwords= words; // local variable for copy in to GPU uint64_t lwords= words; // local variable for copy in to GPU
accelerator_for(sf,osites,Nsimd,{ accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT #ifdef GRID_SIMT
@@ -198,7 +198,7 @@ public:
+ v*bv + v*bv
+ sb; + sb;
// assert(site*lwords<sz); // GRID_ASSERT(site*lwords<sz);
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords]; scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
@@ -219,12 +219,12 @@ public:
int nvec = vecs.size(); int nvec = vecs.size();
assert(vecs[0].Grid()==fine_grid); GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension; int _ndimension = coarse_grid->_ndimension;
assert(block_vol == fine_grid->oSites() / coarse_grid->oSites()); GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension); Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
@@ -299,7 +299,7 @@ public:
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl; // std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid); GRID_ASSERT(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension; int _ndimension = coarse_grid->_ndimension;
@@ -320,7 +320,7 @@ public:
// loop over fine sites // loop over fine sites
const int Nsimd = vobj::Nsimd(); const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar); uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis); GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{ accelerator_for(sc,osites,Nsimd,{
#ifdef GRID_SIMT #ifdef GRID_SIMT
@@ -353,7 +353,7 @@ public:
typedef typename vobj::scalar_object coarse_scalar_object; typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl; // std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
assert(vecs[0].Grid()==coarse_grid); GRID_ASSERT(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension; int _ndimension = coarse_grid->_ndimension;
@@ -375,7 +375,7 @@ public:
// loop over fine sites // loop over fine sites
const int Nsimd = vobj::Nsimd(); const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar); uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
assert(cwords==nbasis); GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{ accelerator_for(sc,osites,Nsimd,{
// Wrap in a macro "FOR_ALL_LANES(lane,{ ... }); // Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
@@ -409,7 +409,7 @@ public:
int nrhs=fine.size(); int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar); int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl; // std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
assert(nbasis==_nbasis); GRID_ASSERT(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs ); BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs ); BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -447,10 +447,10 @@ public:
///////////////////////////////////////// /////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw, nbasis,nrhs,vw,
ComplexD(1.0), scalar(1.0),
Vd, Vd,
Fd, Fd,
ComplexD(0.0), // wipe out C scalar(0.0), // wipe out C
Cd); Cd);
BLAS.synchronise(); BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl; // std::cout << "BlockProject done"<<std::endl;
@@ -464,7 +464,7 @@ public:
{ {
int nrhs=fine.size(); int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar); int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
assert(nbasis==_nbasis); GRID_ASSERT(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs ); BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs ); BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -497,10 +497,10 @@ public:
int64_t vw = block_vol * words; int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nbasis, vw,nrhs,nbasis,
ComplexD(1.0), scalar(1.0),
Vd, Vd,
Cd, Cd,
ComplexD(0.0), // wipe out C scalar(0.0), // wipe out C
Fd); Fd);
BLAS.synchronise(); BLAS.synchronise();
// std::cout << " blas call done"<<std::endl; // std::cout << " blas call done"<<std::endl;
@@ -98,7 +98,7 @@ public:
void ImportEigenVector(Field &evec,RealD &_eval, int ev) void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{ {
// std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl; // std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
assert(ev<eval.size()); GRID_ASSERT(ev<eval.size());
eval[ev] = _eval; eval[ev] = _eval;
int64_t offset = ev*vol*words; int64_t offset = ev*vol*words;
@@ -113,7 +113,7 @@ public:
// Could use to import a batch of eigenvectors // Could use to import a batch of eigenvectors
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev) void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
{ {
assert(_ev0+_nev<=evec.size()); GRID_ASSERT(_ev0+_nev<=evec.size());
Allocate(_nev,evec[0].Grid()); Allocate(_nev,evec[0].Grid());
@@ -126,8 +126,8 @@ public:
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess) void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
{ {
int nrhs = source.size(); int nrhs = source.size();
assert(source.size()==guess.size()); GRID_ASSERT(source.size()==guess.size());
assert(grid == guess[0].Grid()); GRID_ASSERT(grid == guess[0].Grid());
conformable(guess[0],source[0]); conformable(guess[0],source[0]);
int64_t vw = vol * words; int64_t vw = vol * words;
@@ -182,14 +182,14 @@ public:
///////////////////////////////////////// /////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N, BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw, nev,nrhs,vw,
ComplexD(1.0), scalar(1.0),
Ed, Ed,
Rd, Rd,
ComplexD(0.0), // wipe out C scalar(0.0), // wipe out C
Cd); Cd);
BLAS.synchronise(); BLAS.synchronise();
assert(BLAS_C.size()==nev*nrhs); GRID_ASSERT(BLAS_C.size()==nev*nrhs);
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar)); acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
@@ -210,10 +210,10 @@ public:
///////////////////////////////////////// /////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev, vw,nrhs,nev,
ComplexD(1.0), scalar(1.0),
Ed, // x . nev Ed, // x . nev
Cd, // nev . nrhs Cd, // nev . nrhs
ComplexD(0.0), scalar(0.0),
Gd); Gd);
BLAS.synchronise(); BLAS.synchronise();
+1 -1
View File
@@ -270,7 +270,7 @@ class TwoLevelCG : public LinearFunction<Field>
std::vector<RealD> src_nrm(nrhs); std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) { for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]); src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0); GRID_ASSERT(src_nrm[rhs]!=0.0);
} }
std::vector<RealD> tn(nrhs); std::vector<RealD> tn(nrhs);
+324 -4
View File
@@ -53,6 +53,7 @@ class TwoLevelCGmrhs
// Fine operator, Smoother, CoarseSolver // Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop; LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother; LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer; GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer; GridStopWatch PromoteTimer;
@@ -62,7 +63,12 @@ class TwoLevelCGmrhs
GridStopWatch SmoothTimer; GridStopWatch SmoothTimer;
GridStopWatch InsertTimer; GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions // more most opertor functions
TwoLevelCGmrhs(RealD tol, TwoLevelCGmrhs(RealD tol,
Integer maxit, Integer maxit,
@@ -73,12 +79,313 @@ class TwoLevelCGmrhs
MaxIterations(maxit), MaxIterations(maxit),
_FineLinop(FineLinop), _FineLinop(FineLinop),
_Smoother(Smoother) _Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{ {
grid = fine; grid = fine;
}; };
// Vector case // Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x) virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
SolveSingleSystem(src,x);
// SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin "QR" factorisation of the block Z via a Cholesky factorisation of its Gram matrix.
//
// Outputs
//   m_zz : Gram matrix built by _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z), then
//          explicitly Hermitian-symmetrised
//   C    : L^dag, where m_zz = L L^dag (Eigen LLT)
//   Cinv : inverse of C
//   Q,MQ : Z and MZ combined with Cinv through _BlockCGLinalg.MulMatrix
//          (presumably Q = Z C^{-1}, MQ = MZ C^{-1} -- confirm against MultiRHSBlockCGLinalg)
// Inputs
//   Z,MZ : the block to factorise and its companion block
//
// NOTE(review): the LLT result is not checked for success; a rank-deficient Gram matrix
// would propagate non-finite values into C and Cinv.
void ThinQRfact (Eigen::MatrixXcd &m_zz,
                 Eigen::MatrixXcd &C,
                 Eigen::MatrixXcd &Cinv,
                 std::vector<Field> & Q,
                 std::vector<Field> & MQ,
                 const std::vector<Field> & Z,
                 const std::vector<Field> & MZ)
{
  RealD t0=usecond();
  _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
  RealD t1=usecond();

  // Remove rounding-level anti-Hermitian noise before the Cholesky step.
  // The right-hand side reads m_zz.adjoint() while m_zz is the destination; without
  // .eval() Eigen evaluates coefficient-by-coefficient and the second triangle is built
  // from already-overwritten entries (aliasing), leaving m_zz not exactly Hermitian.
  m_zz = (0.5*(m_zz+m_zz.adjoint())).eval();

  Eigen::MatrixXcd L = m_zz.llt().matrixL();

  C    = L.adjoint();
  Cinv = C.inverse();
  RealD t3=usecond();

  _BlockCGLinalg.MulMatrix( Q,Cinv,Z);
  _BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
  RealD t4=usecond();

  std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
  std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
  std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
// Preconditioned block conjugate gradient ("rQ" form) over all right-hand sides at once.
//
//   src : right-hand sides; every entry must have non-zero norm2 (asserted below)
//   X   : handed to Vstart() together with src before the first residual is formed,
//         then updated in place with the solution block
//
// The fine operator is applied through _FineLinop.HermOp and the preconditioner through
// PcgM1. The loop index k runs from 0 to MaxIterations inclusive. Convergence is declared
// when max_b m_rr(b,b)/ssq[b] < Tolerance^2, where m_rr = C^dag C is the running estimate
// of the residual Gram matrix; timings and the true residual per right-hand side are then
// printed and the function returns. Falling out of the loop ends in GRID_ASSERT(0).
//
// NOTE(review): `rn`, `M2Timer` and `m_tmp1` are declared below but never used in this function.
// NOTE(review): the argument convention of _BlockCGLinalg.MaddMatrix (which operand is
// scaled and which is added) is not visible here -- confirm against MultiRHSBlockCGLinalg.
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
// ssq[rhs] is the squared source norm; it normalises the convergence test below.
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); GRID_ASSERT(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
// Member timers are reset here so the breakdown printed at convergence covers this solve only.
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
// k = 0..MaxIterations inclusive, i.e. up to MaxIterations+1 passes.
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
// Re-orthonormalise the updated block; Q and MQ are overwritten, m_S receives the new factor.
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
// Per right-hand-side relative residual^2 is m_rr(b,b)/ssq[b]; the worst one drives the stop test.
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
// Report the true residual |A X - src| / |src| for each right-hand side (diagnostic only).
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
// Reached only when no iteration met the tolerance: report and abort.
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
GRID_ASSERT(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{ {
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl; std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier(); src[0].Grid()->Barrier();
@@ -108,7 +415,7 @@ class TwoLevelCGmrhs
std::vector<RealD> src_nrm(nrhs); std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) { for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]); src_nrm[rhs]=norm2(src[rhs]);
assert(src_nrm[rhs]!=0.0); GRID_ASSERT(src_nrm[rhs]!=0.0);
} }
std::vector<RealD> tn(nrhs); std::vector<RealD> tn(nrhs);
@@ -361,15 +668,26 @@ public:
CoarseField PleftProjMrhs(this->coarsegridmrhs); CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs); CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
for(int rhs=0;rhs<nrhs;rhs++) { // this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
this->_Smoother(in,Min);
this->SmoothTimer.Stop();
#else
for(int rhs=0;rhs<nrhs;rhs++) {
this->SmoothTimer.Start(); this->SmoothTimer.Start();
this->_Smoother(in[rhs],Min[rhs]); this->_Smoother(in[rhs],Min[rhs]);
this->SmoothTimer.Stop(); this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
this->FineTimer.Start(); this->FineTimer.Start();
this->_FineLinop.HermOp(Min[rhs],out[rhs]); this->_FineLinop.HermOp(Min[rhs],out[rhs]);
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
this->FineTimer.Stop(); this->FineTimer.Stop();
@@ -401,9 +719,11 @@ public:
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min] this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop(); this->PromoteTimer.Stop();
this->FineTimer.Start(); this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) { for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
} }
// this->zzz=out[0];
this->FineTimer.Stop(); this->FineTimer.Stop();
} }
}; };
+433
View File
@@ -0,0 +1,433 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Arnoldi.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Patrick Oare <poare@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ARNOLDI_H
#define GRID_ARNOLDI_H
NAMESPACE_BEGIN(Grid);
//Moved to KrylovSchur
#if 0
// This region is compiled out: the live definitions were moved to KrylovSchur.
// It is kept for reference only. Stray merge-conflict markers have been removed.
/**
 * Options for which Ritz values to keep in implicit restart.
 */
enum RitzFilter {
  EvalNormSmall, // Keep evals with smallest norm
  EvalNormLarge, // Keep evals with largest norm
  EvalReSmall,   // Keep evals with smallest real part
  EvalReLarge    // Keep evals with largest real part
};

// Select comparison function from RitzFilter
struct ComplexComparator
{
  RitzFilter f;
  ComplexComparator (RitzFilter _f) : f(_f) {}
  bool operator()(std::complex<double> z1, std::complex<double> z2) {
    // These must be computed before the switch: statements placed between
    // `switch (...) {` and the first case label are never executed, which would
    // leave tmp1/tmp2 uninitialised in the EvalRe* cases.
    // NOTE: despite the EvalRe* names, the comparison is on |Im z| (see "Terrible hack").
    RealD tmp1 = std::abs(std::imag(z1));
    RealD tmp2 = std::abs(std::imag(z2));
    switch (f) {
    case EvalNormSmall:
      return std::abs(z1) < std::abs(z2);
    case EvalNormLarge:
      return std::abs(z1) > std::abs(z2);
      // Terrible hack
      // return std::abs(std::real(z1)) < std::abs(std::real(z2));
      // if ( std::abs(std::real(z1)) >4.) tmp1 +=1.;
      // if ( std::abs(std::real(z2)) >4.) tmp2 +=1.;
    case EvalReSmall:
      return tmp1 < tmp2;
      // return std::abs(std::imag(z1)) < std::abs(std::imag(z2));
    case EvalReLarge:
      return tmp1 > tmp2;
      // return std::abs(std::real(z1)) > std::abs(std::real(z2));
    default:
      assert(0);
    }
    // Only reachable with an invalid filter when assertions are compiled out;
    // avoids flowing off the end of a non-void function.
    return false;
  }
};
#endif
/**
* Implementation of the Arnoldi algorithm.
*/
template<class Field>
class Arnoldi {
private:
std::string cname = std::string("Arnoldi");
int MaxIter; // Max iterations
RealD Tolerance;
RealD ssq;
RealD rtol;
int Nm; // Number of basis vectors to track (equals MaxIter if no restart)
int Nk; // Number of basis vectors to keep every restart (equals -1 if no restart)
int Nstop; // Stop after converging Nstop eigenvectors.
LinearOperatorBase<Field> &Linop;
GridBase *Grid;
RealD approxLambdaMax;
RealD beta_k;
Field f;
std::vector<Field> basis; // orthonormal Arnoldi basis
Eigen::MatrixXcd Hess; // Hessenberg matrix of size Nbasis (after construction)
Eigen::MatrixXcd Qt; // Transpose of basis rotation which projects out high modes.
Eigen::VectorXcd evals; // evals of Hess
Eigen::MatrixXcd littleEvecs; // Nm x Nm evecs matrix
std::vector<Field> evecs; // Vector of evec fields
RitzFilter ritzFilter; // how to sort evals
public:
Arnoldi(LinearOperatorBase<Field> &_Linop, GridBase *_Grid, RealD _Tolerance, RitzFilter filter = EvalReSmall)
: Linop(_Linop), Grid(_Grid), Tolerance(_Tolerance), ritzFilter(filter), f(_Grid), MaxIter(-1), Nm(-1), Nk(-1),
Nstop (-1), evals (0), evecs (), ssq (0.0), rtol (0.0), beta_k (0.0), approxLambdaMax (0.0)
{
f = Zero();
};
/**
* Runs the Arnoldi loop with(out) implicit restarting. For each iteration:
* - Runs an Arnoldi step.
* - Computes the eigensystem of the Hessenberg matrix.
* - Performs implicit restarting.
*/
void operator()(const Field& v0, int _maxIter, int _Nm, int _Nk, int _Nstop, bool doubleOrthog = false) {
MaxIter = _maxIter;
Nm = _Nm; Nk = _Nk;
Nstop = _Nstop;
ssq = norm2(v0);
RealD approxLambdaMax = approxMaxEval(v0);
rtol = Tolerance * approxLambdaMax;
ComplexComparator compareComplex (ritzFilter);
std::cout << GridLogMessage << "Comparing Ritz values with: " << ritzFilter << std::endl;
int start = 1;
Field startVec = v0;
littleEvecs = Eigen::MatrixXcd::Zero(Nm, Nm);
for (int i = 0; i < MaxIter; i++) {
std::cout << GridLogMessage << "Restart Iteration " << i << std::endl;
// Perform Arnoldi steps to compute Krylov basis and Rayleigh quotient (Hess)
arnoldiIteration(startVec, Nm, start, doubleOrthog);
startVec = f;
// compute eigensystem and sort evals
// compute_eigensystem();
compute_eigensystem(Hess);
std::cout << GridLogMessage << "Eigenvalues after Arnoldi step: " << std::endl << evals << std::endl;
std::sort(evals.begin(), evals.end(), compareComplex);
std::cout << GridLogMessage << "Ritz values after sorting (first Nk preserved): " << std::endl << evals << std::endl;
// SU(N)::tepidConfiguration
// Implicit restart to de-weight unwanted eigenvalues
implicitRestart(_Nm, _Nk); // probably can delete _Nm and _Nk from function args
start = Nk;
// check convergence and return if needed.
int Nconv = converged();
std::cout << GridLogMessage << "Number of evecs converged: " << Nconv << std::endl;
if (Nconv >= Nstop || i == MaxIter - 1) {
std::cout << GridLogMessage << "Converged with " << Nconv << " / " << Nstop << " eigenvectors on iteration "
<< i << "." << std::endl;
basisRotate(evecs, Qt, 0, Nk, 0, Nk, Nm);
std::cout << GridLogMessage << "Eigenvalues [first " << Nconv << " converged]: " << std::endl << evals << std::endl;
return;
}
}
}
/**
* Approximates the maximum eigenvalue of Linop.Op to normalize the residual and test for convergence.
*
* Parameters
* ----------
* Field& v0
* Source field to start with. Must have non-zero norm.
* int MAX_ITER (default = 50)
* Maximum number of iterations for power approximation.
*
* Returns
* -------
* RealD lamApprox
* Approximation of largest eigenvalue.
*/
RealD approxMaxEval(const Field& v0, int MAX_ITER = 50) {
assert (norm2(v0) > 1e-8); // must have relatively large source norm to start
RealD lamApprox = 0.0;
RealD denom = 1.0; RealD num = 1.0;
Field v0cp (Grid); Field tmp (Grid);
v0cp = v0;
denom = std::sqrt(norm2(v0cp));
for (int i = 0; i < MAX_ITER; i++) {
Linop.Op(v0cp, tmp); // CAREFUL: do not do Op(tmp, tmp)
v0cp = tmp;
num = std::sqrt(norm2(v0cp)); // num = |A^{n+1} v0|
lamApprox = num / denom; // lam = |A^{n+1} v0| / |A^n v0|
std::cout << GridLogDebug << "Approx for max eval: " << lamApprox << std::endl;
denom = num; // denom = |A^{n} v0|
}
return lamApprox;
}
/**
* Constructs the Arnoldi basis for the Krylov space K_n(D, src). (TODO make private)
*
* Parameters
* ----------
* v0 : Field&
* Source to generate Krylov basis.
* Nm : int
* Final size of the basis desired. If the basis becomes complete before a basis of size Nm is constructed
* (determined by relative tolerance Tolerance), stops iteration there.
* doubleOrthog : bool (default = false)
* Whether to double orthogonalize the basis (for numerical cancellations) or not.
* start : int (default = 0)
* If non-zero, assumes part of the Arnoldi basis has already been constructed.
*/
void arnoldiIteration(const Field& v0, int Nm, int start = 1, bool doubleOrthog = false)
{
ComplexD coeff;
Field w (Grid); // A acting on last Krylov vector.
if (start == 1) { // initialize everything that we need.
RealD v0Norm = 1 / std::sqrt(ssq);
basis.push_back(v0Norm * v0); // normalized source
Hess = Eigen::MatrixXcd::Zero(Nm, Nm);
f = Zero();
} else {
assert( start == basis.size() ); // should be starting at the end of basis (start = Nk)
Eigen::MatrixXcd HessCp = Hess;
Hess = Eigen::MatrixXcd::Zero(Nm, Nm);
Hess(Eigen::seqN(0, Nk), Eigen::seqN(0, Nk)) = HessCp;
}
// Construct next Arnoldi vector by normalizing w_i = Dv_i - \sum_j v_j h_{ji}
for (int i = start - 1; i < Nm; i++) {
Linop.Op(basis.back(), w);
for (int j = 0; j < basis.size(); j++) {
coeff = innerProduct(basis[j], w); // coeff = h_{ij}. Note that since {vi} is ONB it's OK to subtract it off after.
Hess(j, i) = coeff;
w -= coeff * basis[j];
}
if (doubleOrthog) {
// TODO implement
}
// add w_i to the pile
if (i < Nm - 1) {
coeff = std::sqrt(norm2(w));
Hess(i+1, i) = coeff;
basis.push_back(
(1.0/coeff) * w
);
}
// after iterations, update f and beta_k = ||f||
f = w; // make sure f is not normalized
beta_k = std::sqrt(norm2(f)); // beta_k = ||f_k|| determines convergence.
}
std::cout << GridLogMessage << "|f|^2 after Arnoldi step = " << norm2(f) << std::endl;
std::cout << GridLogDebug << "Computed Hessenberg matrix = " << std::endl << Hess << std::endl;
return;
}
/**
* Approximates the eigensystem of the linear operator by computing the eigensystem of
* the Hessenberg matrix. Assumes that the Hessenberg matrix has already been constructed (by
* calling the operator() function).
*
* TODO implement in parent class eventually.
*
* Parameters
* ----------
* Eigen::MatrixXcd& S
* Schur matrix (upper triangular) similar to original Rayleigh quotient.
*/
void compute_eigensystem(Eigen::MatrixXcd& S)
{
std::cout << GridLogMessage << "Computing eigenvalues." << std::endl;
evecs.clear();
Eigen::ComplexEigenSolver<Eigen::MatrixXcd> es;
es.compute(S);
evals = es.eigenvalues();
littleEvecs = es.eigenvectors();
// Convert evecs to lattice fields
for (int k = 0; k < evals.size(); k++) {
Eigen::VectorXcd vec = littleEvecs.col(k);
Field tmp (basis[0].Grid());
tmp = Zero();
for (int j = 0; j < basis.size(); j++) {
tmp = tmp + vec[j] * basis[j];
}
evecs.push_back(tmp);
}
std::cout << GridLogMessage << "Eigenvalues: " << std::endl << evals << std::endl;
}
/**
* Verifies the factorization DV = V^\dag H + f e^\dag with the last-computed
* V, H, f.
*/
// RealD verifyFactorization() {
// int k = basis.size(); // number of basis vectors, also the size of H.
// std::vector<Field> factorized (k, Zero());
// Field tmp (FGrid); tmp = Zero();
// for (int i = 0; i < basis.size(); i++) {
// Linop.Op(basis[i], tmp);
// }
// // basisRotate(basis, Q, 0, Nk, 0, Nk, Nm);
// // Linop.Op(, )
// }
/* Getters */
Eigen::MatrixXcd getHessenbergMat() { return Hess; }
Field getF() { return f; }
std::vector<Field> getBasis() { return basis; }
Eigen::VectorXcd getEvals() { return evals; }
std::vector<Field> getEvecs() { return evecs; }
/**
* Implements implicit restarting for Arnoldi. Assumes eigenvalues are sorted.
*
* Parameters
* ----------
* int _Nm
* Size of basis to keep (Hessenberg is MxM).
* int Nk
* Number of basis vectors to keep at each restart.
*/
void implicitRestart(int _Nm, int _Nk) {
assert ( _Nk <= _Nm );
Nm = _Nm; Nk = _Nk;
int Np = Nm - Nk; // keep Nk smallest (or largest, depends on sort function) evecs
std::cout << GridLogMessage << "Computing QR Factorizations." << std::endl;
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Identity(Nm, Nm);
Eigen::MatrixXcd Qi (Nm, Nm);
Eigen::MatrixXcd R (Nm, Nm);
for (int i = Nk; i < Nm; i++) { // keep the first Nk eigenvalues and iterate through the last Np. Should loop Np times
// Useful debugging output
std::cout << GridLogDebug << "Computing QR factorization for i = " << i << std::endl;
std::cout << GridLogDebug << "Eval shift = " << evals[i] << std::endl;
std::cout << GridLogDebug << "Hess before rotation: " << Hess << std::endl;
// QR factorize
Eigen::HouseholderQR<Eigen::MatrixXcd> QR (Hess - evals[i] * Eigen::MatrixXcd::Identity(Nm, Nm));
Qi = QR.householderQ();
Q = Q * Qi;
Hess = Qi.adjoint() * Hess * Qi;
std::cout << GridLogDebug << "Qt up to i = " << Q.transpose() << std::endl;
}
std::cout << GridLogDebug << "Hess after all rotations: " << std::endl << Hess << std::endl;
// form Arnoldi vector f: f is normal to the basis vectors and its norm \beta is used to determine the Ritz estimate.
std::complex<double> beta = Hess(Nk, Nk-1);
std::complex<double> sigma = Q(Nm-1, Nk-1);
f = basis[Nk] * beta + f * sigma;
RealD betak = std::sqrt(norm2(f));
std::cout << GridLogMessage << "|f|^2 after implicit restart = " << norm2(f) << std::endl;
// Rotate basis by Qt
Qt = Q.transpose();
basisRotate(basis, Qt, 0, Nk + 1, 0, Nm, Nm);
// rotate
basisRotate(evecs, Qt, 0, Nk + 1, 0, Nm, Nm);
// Truncate the basis and restart
basis = std::vector<Field> (basis.begin(), basis.begin() + Nk);
// evecs = std::vector<Field> (evecs.begin(), evecs.begin() + Nk);
Hess = Hess(Eigen::seqN(0, Nk), Eigen::seqN(0, Nk));
std::cout << "evecs size: " << evecs.size() << std::endl;
}
/**
* Computes the number of Arnoldi eigenvectors that have converged. An eigenvector s is considered converged
* for a tolerance epsilon if
* r(s) := |\beta e_m^T s| < epsilon
* where beta is the norm of f_{m+1}.
*
* Parameters
* ----------
*
* Returns
* -------
* int : Number of converged eigenvectors.
*/
int converged() {
int Nconv = 0;
for (int k = 0; k < evecs.size(); k++) {
RealD emTs = std::abs(littleEvecs(Nm - 1, k)); // e_m^T s
RealD ritzEstimate = beta_k * emTs;
// TODO should be ritzEstimate < Tolerance * lambda_max
std::cout << GridLogMessage << "Ritz estimate for evec " << k << " = " << ritzEstimate << std::endl;
if (ritzEstimate < rtol) {
Nconv++;
}
}
return Nconv;
}
};
NAMESPACE_END(Grid);
#endif
+4 -4
View File
@@ -47,7 +47,7 @@ class BiCGSTAB : public OperatorFunction<Field>
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -77,7 +77,7 @@ class BiCGSTAB : public OperatorFunction<Field>
// Initial residual computation & set up // Initial residual computation & set up
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); GRID_ASSERT(std::isnan(guess) == 0);
Linop.Op(psi, v); Linop.Op(psi, v);
b = norm2(v); b = norm2(v);
@@ -214,7 +214,7 @@ class BiCGSTAB : public OperatorFunction<Field>
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl; std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl; std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); } if(ErrorOnNoConverge){ GRID_ASSERT(true_residual / Tolerance < 10000.0); }
IterationsToComplete = k; IterationsToComplete = k;
@@ -224,7 +224,7 @@ class BiCGSTAB : public OperatorFunction<Field>
std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl; std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
if(ErrorOnNoConverge){ assert(0); } if(ErrorOnNoConverge){ GRID_ASSERT(0); }
IterationsToComplete = k; IterationsToComplete = k;
} }
}; };
@@ -31,6 +31,58 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// Fill m(b,bp) with innerProduct(X[b],Y[bp]) for every pair of block indices.
// The block size is taken from X.size(); m must already be at least that large
// in both dimensions and Y must hold at least as many fields as X.
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
  typedef typename Field::scalar_type scomplex;
  const int Nblock = X.size();
  for(int row=0; row<Nblock; row++){
    const Field &lhs = X[row];
    for(int col=0; col<Nblock; col++){
      m(row,col) = innerProduct(lhs,Y[col]);
    }
  }
}
// Block multiply-add:  AP[b] = Y[b] + sum_bp scale * m(bp,b) * X[bp]
//
// The block size is AP.size(); X and Y must hold at least that many fields and
// m must be at least (Nblock x Nblock).  The result is built in a temporary
// vector first, so AP may alias X or Y.
//
// Should make this cache friendly with site outermost, parallel_for
// Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
  typedef typename Field::scalar_type scomplex;
  const int Nblock = AP.size();
  // Nothing to compute for an empty block; returning here also avoids
  // touching X[0] (used below as the construction prototype) when X is empty.
  if ( Nblock == 0 ) return;

  // X[0] only serves as a prototype to construct correctly laid out fields.
  std::vector<Field> tmp(Nblock,X[0]);
  for(int b=0;b<Nblock;b++){
    tmp[b] = Y[b];
    for(int bp=0;bp<Nblock;bp++) {
      tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
    }
  }
  // Copy back only once every output is complete (handles aliasing with X / Y).
  for(int b=0;b<Nblock;b++){
    AP[b] = tmp[b];
  }
}
// Block multiply:  AP[b] = sum_bp m(bp,b) * X[bp]
// The block size is AP.size(); each output is zeroed and then accumulated in place.
// Should make this cache friendly with site outermost, parallel_for
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
  typedef typename Field::scalar_type scomplex;
  const int Nblock = AP.size();
  for(int col=0; col<Nblock; col++){
    Field &out = AP[col];
    out = Zero();
    for(int row=0; row<Nblock; row++){
      out += scomplex(m(row,col))*X[row];
    }
  }
}
// Sum of norm2() over every field in the block, accumulated in vector order.
// Returns 0.0 for an empty block.
template<class Field>
double normv(const std::vector<Field> &P){
  double total = 0.0;
  for(const Field &field : P) {
    total += norm2(field);
  }
  return total;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -46,7 +98,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
int Nblock; int Nblock;
BlockCGtype CGtype; BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog); sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related // Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
const std::vector<Field> & R) const std::vector<Field> & R)
{ {
InnerProductMatrix(m_rr,R,R); InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
@@ -131,7 +201,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
} else if (CGtype == CGmultiRHS ) { } else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi); CGmultiRHSsolve(Linop,Src,Psi);
} else { } else {
assert(0); GRID_ASSERT(0);
} }
} }
virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi)
@@ -139,7 +209,7 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
if ( CGtype == BlockCGrQVec ) { if ( CGtype == BlockCGrQVec ) {
BlockCGrQsolveVec(Linop,Src,Psi); BlockCGrQsolveVec(Linop,Src,Psi);
} else { } else {
assert(0); GRID_ASSERT(0);
} }
} }
@@ -186,12 +256,13 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog); sliceNorm(ssq,B,Orthog);
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog); sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog); sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
/************************************************************************ /************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001) * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD); Linop.HermOp(X, AD);
tmp = B - AD; tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q; D=Q;
@@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
SolverTimer.Start(); SolverTimer.Start();
RealD max_resid=0;
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations; k++) {
@@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/ */
m_rr = m_C.adjoint() * m_C; m_rr = m_C.adjoint() * m_C;
RealD max_resid=0; max_resid=0;
RealD rrsum=0; RealD rrsum=0;
RealD rr; RealD rr;
@@ -322,9 +398,11 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
} }
} }
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0); std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -360,10 +438,10 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog); sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog); sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
// Initial search dir is guess // Initial search dir is guess
Linop.HermOp(Psi, AP); Linop.HermOp(Psi, AP);
@@ -462,47 +540,10 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
} }
std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl; std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation: // BlockCGrQvec implementation:
//-------------------------- //--------------------------
@@ -513,7 +554,7 @@ double normv(const std::vector<Field> &P){
void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X)
{ {
Nblock = B.size(); Nblock = B.size();
assert(Nblock == X.size()); GRID_ASSERT(Nblock == X.size());
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
@@ -549,13 +590,14 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);} for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
/************************************************************************ /************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001) * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) { for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]); Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b]; tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
} }
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
@@ -688,7 +731,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
} }
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl; std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
@@ -36,7 +36,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge, bool ErrorOnNoConverge; // Throw an GRID_ASSERT when CAGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -82,7 +82,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); GRID_ASSERT(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -137,7 +137,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl; std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
assert(0); GRID_ASSERT(0);
} }
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -185,7 +185,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
} }
} }
assert(0); // Never reached GRID_ASSERT(0); // Never reached
return cp; return cp;
} }
+140 -4
View File
@@ -38,13 +38,14 @@ NAMESPACE_BEGIN(Grid);
// single input vec, single output vec. // single input vec, single output vec.
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Field> template <class Field>
class ConjugateGradient : public OperatorFunction<Field> { class ConjugateGradient : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -57,10 +58,22 @@ public:
ErrorOnNoConverge(err_on_no_conv) ErrorOnNoConverge(err_on_no_conv)
{}; {};
// Per-iteration hook, invoked from operator() with the iteration index k and
// the current CG coefficients a and b.  The default implementation does nothing;
// derived classes may override it to record the iteration history.
virtual void LogIteration(int k,RealD a,RealD b)
{
  // std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
}
// Start-of-solve hook, invoked at the top of operator().  The default
// implementation only prints a marker line; derived classes may override it.
virtual void LogBegin()
{
  std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
}
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
this->LogBegin();
GRID_TRACE("ConjugateGradient"); GRID_TRACE("ConjugateGradient");
GridStopWatch PreambleTimer; GridStopWatch PreambleTimer;
GridStopWatch ConstructTimer;
GridStopWatch NormTimer;
GridStopWatch AssignTimer;
PreambleTimer.Start(); PreambleTimer.Start();
psi.Checkerboard() = src.Checkerboard(); psi.Checkerboard() = src.Checkerboard();
@@ -70,14 +83,19 @@ public:
//RealD b_pred; //RealD b_pred;
// Was doing copies // Was doing copies
ConstructTimer.Start();
Field p (src.Grid()); Field p (src.Grid());
Field mmp(src.Grid()); Field mmp(src.Grid());
Field r (src.Grid()); Field r (src.Grid());
ConstructTimer.Stop();
// Initial residual computation & set up // Initial residual computation & set up
NormTimer.Start();
ssq = norm2(src); ssq = norm2(src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); NormTimer.Stop();
GRID_ASSERT(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) { if ( guess == 0.0 ) {
r = src; r = src;
p = r; p = r;
@@ -89,6 +107,7 @@ public:
a = norm2(p); a = norm2(p);
} }
cp = a; cp = a;
AssignTimer.Stop();
// Handle trivial case of zero src // Handle trivial case of zero src
if (ssq == 0.){ if (ssq == 0.){
@@ -164,6 +183,7 @@ public:
} }
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
LogIteration(k,a,b);
IterationTimer.Stop(); IterationTimer.Stop();
if ( (k % 500) == 0 ) { if ( (k % 500) == 0 ) {
@@ -202,7 +222,7 @@ public:
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
IterationsToComplete = k; IterationsToComplete = k;
TrueResidual = true_residual; TrueResidual = true_residual;
@@ -220,6 +240,9 @@ public:
<<" residual "<< std::sqrt(cp / ssq)<< std::endl; <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
SolverTimer.Stop(); SolverTimer.Stop();
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Solver breakdown "<<std::endl; std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
@@ -228,10 +251,123 @@ public:
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl; std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
}; };
/////////////////////////////////////////////////////////////
// Conjugate gradient that records, through the LogBegin / LogIteration hooks
// of ConjugateGradient, the coefficients (a,b) of every iteration and the
// polynomials in the operator A that the solve builds up.
//
// All poly_* vectors and 'polynomial' hold coefficients in increasing powers
// of A: entry [i] multiplies A^i applied to the source.  The bookkeeping
// follows the zero initial guess recurrence (p = r = src), which is what
// Solve() sets up.
/////////////////////////////////////////////////////////////
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
  // Optionally record the CG polynomial
  std::vector<double> ak;         // 'a' passed to LogIteration, one entry per iteration
  std::vector<double> bk;         // 'b' passed to LogIteration, one entry per iteration
  std::vector<double> poly_p;     // polynomial of the search direction p
  std::vector<double> poly_r;     // polynomial of the residual r
  std::vector<double> poly_Ap;    // polynomial of A p (poly_p shifted up one power)
  std::vector<double> polynomial; // polynomial of the solution x

public:
  ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
  { }

  // Apply the recorded solution polynomial:  psi = sum_n polynomial[n] A^n src,
  // where A is Linop.HermOp.  If no polynomial has been recorded yet the zero
  // polynomial is applied (psi = 0) instead of indexing an empty vector.
  void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
  {
    if ( polynomial.empty() ) {
      psi=Zero();
      psi.Checkerboard()=src.Checkerboard();
      return;
    }

    Field tmp(src.Grid());
    Field AtoN(src.Grid());   // running power A^n src
    AtoN = src;
    psi=AtoN*polynomial[0];
    const int Nterms = static_cast<int>(polynomial.size());
    for(int n=1;n<Nterms;n++){
      tmp = AtoN;             // HermOp input and output must be distinct fields
      Linop.HermOp(tmp,AtoN);
      psi = psi + polynomial[n]*AtoN;
    }
  }

  // Replay the recorded (ak,bk) sequence as a fixed CG recurrence started
  // from a zero guess, without recomputing any inner products.
  void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
  {
    Field Ap(src.Grid());
    Field r(src.Grid());
    Field p(src.Grid());
    p=src;
    r=src;
    x=Zero();
    x.Checkerboard()=src.Checkerboard();
    const int Nsteps = static_cast<int>(ak.size());
    for(int k=0;k<Nsteps;k++){
      x = x + ak[k]*p;
      Linop.HermOp(p,Ap);
      r = r - ak[k] * Ap;
      p = r + bk[k] * p;
    }
  }

  // Solve from a zero initial guess, which is the case the polynomial
  // bookkeeping in LogIteration is written for.
  void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
  {
    psi=Zero();
    this->operator ()(Linop,src,psi);
  }

  // Reset all recorded history at the start of a solve:
  // x -> empty polynomial, p = r = 1 (i.e. p = r = src).
  void LogBegin(void) override
  {
    std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
    ak.resize(0);
    bk.resize(0);
    polynomial.resize(0);
    poly_Ap.resize(0);
    poly_p.resize(1);
    poly_r.resize(1);
    poly_p[0]=1.0;
    poly_r[0]=1.0;
  }

  // Record iteration k (expected to be called with k = 1,2,3,... in order
  // after LogBegin) and advance the polynomials of x, r and p.
  void LogIteration(int k,RealD a,RealD b) override
  {
    // With zero guess,
    // p = r = src
    //
    // iterate:
    //   x = x + a p
    //   r = r - a A p
    //   p = r + b p
    //
    // [0]
    // r = x
    // p = x
    // Ap=0
    //
    // [1]
    // Ap = A x + 0 ==> shift poly P right by 1 and add 0.
    // x = x + a p  ==> add polynomials term by term
    // r = r - a A p ==> add polynomials term by term
    // p = r + b p  ==> add polynomials term by term
    //
    std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
    ak.push_back(a);
    bk.push_back(b);

    // Ap= right_shift(p)
    poly_Ap.resize(k+1);
    poly_Ap[0]=0.0;
    for(int i=0;i<k;i++){
      poly_Ap[i+1]=poly_p[i];
    }

    // x = x + a p
    polynomial.resize(k);
    polynomial[k-1]=0.0;
    for(int i=0;i<k;i++){
      polynomial[i] = polynomial[i] + a * poly_p[i];
    }

    // r = r - a Ap
    // p = r + b p
    poly_r.resize(k+1);
    poly_p.resize(k+1);
    poly_r[k] = poly_p[k] = 0.0;
    for(int i=0;i<k+1;i++){
      poly_r[i] = poly_r[i] - a * poly_Ap[i];
      poly_p[i] = poly_r[i] + b * poly_p[i];
    }
  }
};
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
//Compute double precision rsd and also new RHS vector. //Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d); Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){ if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break; break;
} }
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp); precisionChange(src_f, src_d, pc_wk_dp_to_sp);
@@ -77,7 +77,7 @@ public:
} }
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){ void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size()); GRID_ASSERT(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size(); int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
@@ -98,15 +98,15 @@ public:
std::vector<RealD> alpha(nshift,1.0); std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions std::vector<Field> ps(nshift,grid);// Search directions
assert(psi.size()==nshift); GRID_ASSERT(psi.size()==nshift);
assert(mass.size()==nshift); GRID_ASSERT(mass.size()==nshift);
assert(mresidual.size()==nshift); GRID_ASSERT(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // remove dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift]; std::vector<RealD> bs(nshift);
RealD rsq[nshift]; std::vector<RealD> rsq(nshift);
RealD z[nshift][2]; std::vector<std::array<RealD,2> > z(nshift);
int converged[nshift]; std::vector<int> converged(nshift);
const int primary =0; const int primary =0;
@@ -122,7 +122,7 @@ public:
// Check lightest mass // Check lightest mass
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] ); GRID_ASSERT( mass[s]>= mass[primary] );
converged[s]=0; converged[s]=0;
} }
@@ -338,7 +338,7 @@ public:
} }
// ugly hack // ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0); // GRID_ASSERT(0);
} }
}; };
@@ -118,16 +118,16 @@ public:
FieldF r_f(SinglePrecGrid); FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid); FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift); GRID_ASSERT(psi_d.size()==nshift);
assert(mass.size()==nshift); GRID_ASSERT(mass.size()==nshift);
assert(mresidual.size()==nshift); GRID_ASSERT(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift]; std::vector<RealD> bs(nshift);
RealD rsq[nshift]; std::vector<RealD> rsq(nshift);
RealD rsqf[nshift]; std::vector<RealD> rsqf(nshift);
RealD z[nshift][2]; std::vector<std::array<RealD,2> > z(nshift);
int converged[nshift]; std::vector<int> converged(nshift);
const int primary =0; const int primary =0;
@@ -141,7 +141,7 @@ public:
// Check lightest mass // Check lightest mass
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] ); GRID_ASSERT( mass[s]>= mass[primary] );
converged[s]=0; converged[s]=0;
} }
@@ -179,7 +179,7 @@ public:
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d; tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4); // GRID_ASSERT(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d); axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d); RealD rn = norm2(p_d);
@@ -365,7 +365,7 @@ public:
} }
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0); GRID_ASSERT(0);
} }
}; };
@@ -48,12 +48,12 @@ public:
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){} ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); } void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); } void OpDir (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); } void OpDirAll (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); }
void Op (const Field &in, Field &out){ assert(0); } void Op (const Field &in, Field &out){ GRID_ASSERT(0); }
void AdjOp (const Field &in, Field &out){ assert(0); } void AdjOp (const Field &in, Field &out){ GRID_ASSERT(0); }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out); linop_base.HermOp(in, out);
@@ -151,16 +151,16 @@ public:
FieldD r_d(DoublePrecGrid); FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid); FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift); GRID_ASSERT(psi_d.size()==nshift);
assert(mass.size()==nshift); GRID_ASSERT(mass.size()==nshift);
assert(mresidual.size()==nshift); GRID_ASSERT(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift]; std::vector<RealD> bs(nshift);
RealD rsq[nshift]; std::vector<RealD> rsq(nshift);
RealD rsqf[nshift]; std::vector<RealD> rsqf(nshift);
RealD z[nshift][2]; std::vector<std::array<RealD,2> > z(nshift);
int converged[nshift]; std::vector<int> converged(nshift);
const int primary =0; const int primary =0;
@@ -174,7 +174,7 @@ public:
// Check lightest mass // Check lightest mass
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] ); GRID_ASSERT( mass[s]>= mass[primary] );
converged[s]=0; converged[s]=0;
} }
@@ -211,7 +211,7 @@ public:
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d; tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
assert(norm2(tmp_d)< 1.0); GRID_ASSERT(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d); axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d); RealD rn = norm2(p_d);
@@ -408,7 +408,7 @@ public:
} }
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0); GRID_ASSERT(0);
} }
}; };
@@ -35,7 +35,7 @@ template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> { class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public: public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -66,7 +66,7 @@ public:
DoFinalCleanup(true), DoFinalCleanup(true),
Linop_fallback(NULL) Linop_fallback(NULL)
{ {
assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1"); GRID_ASSERT(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
}; };
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){ void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
@@ -90,7 +90,7 @@ public:
// Initial residual computation & set up // Initial residual computation & set up
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); GRID_ASSERT(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b); Linop_d.HermOpAndNorm(psi, mmp, d, b);
@@ -217,7 +217,7 @@ public:
CG(Linop_d,src,psi); CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete; IterationsToCleanup = CG.IterationsToComplete;
} }
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); else if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n"; std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return; return;
@@ -263,7 +263,7 @@ public:
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge" std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl; << std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
ReliableUpdatesPerformed = l; ReliableUpdatesPerformed = l;
} }
@@ -0,0 +1,277 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientTimeslice.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_TIMESLICE_H
#define GRID_CONJUGATE_GRADIENT_TIMESLICE_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/**
* Simple modification of conjugate gradient that outputs the residual as a function
* of time, in order to study the large wavelength behavior of the solver.
*/
template <class Field>
class ConjugateGradientTimeslice : public OperatorFunction<Field> {
public:

  using OperatorFunction<Field>::operator();

  bool    ErrorOnNoConverge;     // When true, assert if the solve fails to converge
                                 // (or converges to a badly wrong true residual). Defaults true.
  RealD   Tolerance;             // Target relative residual; the loop stops once cp <= Tolerance^2 * |src|^2
  Integer MaxIterations;         // Upper bound on the number of CG iterations
  Integer IterationsToComplete;  // Number of iterations the CG took to finish. Filled in upon completion
  RealD   TrueResidual;          // |A psi - src| / |src|, written on the early-exit and converged paths;
                                 // it is NOT updated when the iteration limit is reached.

  // The initialiser list follows the member declaration order above, which is the
  // order in which the members are actually initialised.
  ConjugateGradientTimeslice(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : ErrorOnNoConverge(err_on_no_conv),
      Tolerance(tol),
      MaxIterations(maxit)
  {};

  // Per-iteration hook, called after each update with the iteration number k and
  // the two CG coefficients a and b of that step. The default does nothing.
  virtual void LogIteration(int k,RealD a,RealD b){
    //    std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
  };

  // Hook called once at the start of every solve.
  virtual void LogBegin(void){
    std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
  };

  /**
   * Solve Linop.HermOp * psi = src by conjugate gradient, using psi as the initial guess.
   *
   * On return psi holds the solution estimate and IterationsToComplete is filled in.
   * TrueResidual is filled in on every path except the "did NOT converge" one.
   * If ErrorOnNoConverge is set, failure to reach the target within MaxIterations
   * (or a true residual more than 10^4 times the tolerance) triggers an assert.
   */
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {

    this->LogBegin();

    GRID_TRACE("ConjugateGradientTimeslice");
    GridStopWatch PreambleTimer;
    GridStopWatch ConstructTimer;
    GridStopWatch NormTimer;
    GridStopWatch AssignTimer;
    PreambleTimer.Start();
    psi.Checkerboard() = src.Checkerboard();

    conformable(psi, src);

    RealD cp, c, a, ssq, qq;
    // d and b are only assigned by HermOpAndNorm when the guess is non-zero, but they
    // are logged unconditionally below; start them at zero so that the log never
    // reads an indeterminate value.
    RealD d = 0.;
    RealD b = 0.;

    // Work vectors: search direction, operator applied to it, and residual.
    ConstructTimer.Start();
    Field p  (src.Grid());
    Field mmp(src.Grid());
    Field r  (src.Grid());
    ConstructTimer.Stop();

    // Initial residual computation & set up
    NormTimer.Start();
    ssq = norm2(src);               // Norm of source vector ||b||^2

    // Norm |b(x, t)|^2 as a field. This lattice was previously assigned without ever
    // being declared; let the compiler deduce its type from localNorm2.
    auto ssqtx = localNorm2(src);
    // Norm of source not summed over time slices, ssq(t) = \sum_x |b(x, t)|^2.
    // NOTE(review): the element type is taken from the lattice being reduced rather
    // than assumed to be RealD; confirm against the sliceSum signature.
    std::vector<typename decltype(ssqtx)::scalar_object> ssqt;
    sliceSum(ssqtx, ssqt, Tdir);    // TODO make sure Tdir is globally defined
    // NOTE(review): ssqt is not consumed anywhere below yet; the per-timeslice
    // residual output described in the class comment is still to be implemented.

    RealD guess = norm2(psi);       // Norm of initial guess ||psi||^2
    NormTimer.Stop();
    assert(std::isnan(guess) == 0);

    AssignTimer.Start();
    if ( guess == 0.0 ) {
      // Zero guess: the residual is the source itself, no operator application needed.
      r = src;
      p = r;
      a = ssq;
    } else {
      Linop.HermOpAndNorm(psi, mmp, d, b);
      r = src - mmp;                // Initial residual r0 = b - A guess
      p = r;                        // initial conj vector p0 = r0
      a = norm2(p);
    }
    cp = a;
    AssignTimer.Stop();

    // Handle trivial case of zero src
    if (ssq == 0.){
      psi = Zero();
      IterationsToComplete = 1;
      TrueResidual = 0.;
      return;
    }

    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mmp " << b << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: cp,r " << cp << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: p " << a << std::endl;

    // Absolute target for the squared residual norm.
    RealD rsq = Tolerance * Tolerance * ssq;

    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a/ssq);
      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
      IterationsToComplete = 0;
      return;
    }

    std::cout << GridLogIterative << std::setprecision(8)
              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;

    PreambleTimer.Stop();
    GridStopWatch LinalgTimer;
    GridStopWatch InnerTimer;
    GridStopWatch AxpyNormTimer;
    GridStopWatch LinearCombTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;

    RealD usecs = -usecond();
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {

      GridStopWatch IterationTimer;
      IterationTimer.Start();
      c = cp;                       // squared residual norm of the previous step

      MatrixTimer.Start();
      Linop.HermOp(p, mmp);         // Computes mmp = Ap
      MatrixTimer.Stop();

      LinalgTimer.Start();

      InnerTimer.Start();
      ComplexD dc = innerProduct(p,mmp);   // p^\dagger A p
      InnerTimer.Stop();
      d = dc.real();
      a = c / d;                    // step length along p

      AxpyNormTimer.Start();
      // Residual update r_{k+1} = r_k - a * A p_k; the value returned by axpy_norm is
      // stored in cp and used below as the new squared residual norm ||r_{k+1}||^2.
      cp = axpy_norm(r, -a, mmp, r);
      AxpyNormTimer.Stop();
      b = cp / c;                   // coefficient for the new search direction

      LinearCombTimer.Start();
      {
        // Fused update of the solution and the search direction:
        //   psi <- psi + a * p
        //   p   <- r   + b * p
        autoView( psi_v , psi, AcceleratorWrite);
        autoView( p_v   , p,   AcceleratorWrite);
        autoView( r_v   , r,   AcceleratorWrite);
        accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
          coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
          coalescedWrite(p_v[ss]  , b * p_v(ss) + r_v  (ss));
        });
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
      LogIteration(k,a,b);

      IterationTimer.Stop();
      // Every 500th iteration is reported at message level, the others at iterative level.
      if ( (k % 500) == 0 ) {
        std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
                  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
      } else {
        std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
      }

      // Stopping condition
      if (cp <= rsq) {
        usecs +=usecond();
        SolverTimer.Stop();

        // Recompute the residual from scratch, p = A psi - src, so that the reported
        // true residual does not rely on the recursively updated r.
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;

        GridBase *grid = src.Grid();
        RealD DwfFlops = (1452. )*grid->gSites()*4*k
                       + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
        RealD srcnorm = std::sqrt(norm2(src));
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;

        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
                  << "\tComputed residual " << std::sqrt(cp / ssq)
                  << "\tTrue residual " << true_residual
                  << "\tTarget " << Tolerance << std::endl;

        // std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tSolver Elapsed " << SolverTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
        std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

        std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;

        // Guard against the iterated residual having drifted far from the true one.
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
        IterationsToComplete = k;
        TrueResidual = true_residual;

        return;
      }
    }

    // Failed: the iteration limit was reached. The true residual is not recomputed
    // on this path, so TrueResidual keeps whatever value it had before the call.
    //    Linop.HermOpAndNorm(psi, mmp, d, qq);
    //    p = mmp - src;
    //    TrueResidual = sqrt(norm2(p)/ssq);
    //    TrueResidual = 1;

    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
              <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
    SolverTimer.Stop();
    std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
    std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage<< "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;

  }
};
NAMESPACE_END(Grid);
#endif
@@ -106,7 +106,7 @@ public:
} }
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl; std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0); GRID_ASSERT(0);
} }
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
@@ -36,7 +36,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge, bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FCAGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -87,7 +87,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); GRID_ASSERT(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -144,7 +144,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl; std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
assert(0); GRID_ASSERT(0);
} }
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -191,7 +191,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
} }
} }
assert(0); // Never reached GRID_ASSERT(0); // Never reached
return cp; return cp;
} }
@@ -36,7 +36,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge, bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -85,7 +85,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); GRID_ASSERT(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -142,7 +142,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl; std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
assert(0); GRID_ASSERT(0);
} }
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -189,7 +189,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
} }
} }
assert(0); // Never reached GRID_ASSERT(0); // Never reached
return cp; return cp;
} }
@@ -36,7 +36,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge, bool ErrorOnNoConverge; // Throw an GRID_ASSERT when GMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -80,7 +80,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); GRID_ASSERT(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -135,7 +135,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl; std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
assert(0); GRID_ASSERT(0);
} }
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -181,7 +181,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
} }
} }
assert(0); // Never reached GRID_ASSERT(0); // Never reached
return cp; return cp;
} }
@@ -175,7 +175,7 @@ public:
eresid(_eresid), MaxIter(_MaxIter), eresid(_eresid), MaxIter(_MaxIter),
diagonalisation(_diagonalisation),split_test(0), diagonalisation(_diagonalisation),split_test(0),
Nevec_acc(_Nu) Nevec_acc(_Nu)
{ assert( (Nk%Nu==0) && (Nm%Nu==0) ); }; { GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
//////////////////////////////// ////////////////////////////////
// Helpers // Helpers
@@ -206,7 +206,7 @@ public:
Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl; Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
} }
} }
assert(normalize(w,if_print) != 0); GRID_ASSERT(normalize(w,if_print) != 0);
} }
void reorthogonalize(Field& w, std::vector<Field>& evec, int k) void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
{ {
@@ -225,7 +225,7 @@ public:
w[i] = w[i] - ip * evec[j]; w[i] = w[i] - ip * evec[j];
}} }}
for(int i=0; i<_Nu; ++i) for(int i=0; i<_Nu; ++i)
assert(normalize(w[i],if_print) !=0); GRID_ASSERT(normalize(w[i],if_print) !=0);
} }
@@ -244,7 +244,7 @@ public:
const uint64_t sites = grid->lSites(); const uint64_t sites = grid->lSites();
int Nbatch = R/Nevec_acc; int Nbatch = R/Nevec_acc;
assert( R%Nevec_acc == 0 ); GRID_ASSERT( R%Nevec_acc == 0 );
// Glog << "nBatch, Nevec_acc, R, Nu = " // Glog << "nBatch, Nevec_acc, R, Nu = "
// << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl; // << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl;
@@ -302,7 +302,7 @@ public:
} }
} }
for (int i=0; i<Nu; ++i) { for (int i=0; i<Nu; ++i) {
assert(normalize(w[i],do_print)!=0); GRID_ASSERT(normalize(w[i],do_print)!=0);
} }
Glog << "cuBLAS Zgemm done"<< std::endl; Glog << "cuBLAS Zgemm done"<< std::endl;
@@ -374,8 +374,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
{ {
std::string fname = std::string(cname+"::calc_irbl()"); std::string fname = std::string(cname+"::calc_irbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
assert(grid == src[0].Grid()); GRID_ASSERT(grid == src[0].Grid());
assert( Nu = src.size() ); GRID_ASSERT( Nu = src.size() );
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
@@ -396,7 +396,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
assert(Nm == evec.size() && Nm == eval.size()); GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -579,8 +579,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
{ {
std::string fname = std::string(cname+"::calc_rbl()"); std::string fname = std::string(cname+"::calc_rbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
assert(grid == src[0].Grid()); GRID_ASSERT(grid == src[0].Grid());
assert( Nu = src.size() ); GRID_ASSERT( Nu = src.size() );
int Np = (Nm-Nk); int Np = (Nm-Nk);
if (Np > 0 && MaxIter > 1) Np /= MaxIter; if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -607,7 +607,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
assert(Nm == evec.size() && Nm == eval.size()); GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -785,7 +785,7 @@ private:
int Nu = w.size(); int Nu = w.size();
int Nm = evec.size(); int Nm = evec.size();
assert( b < Nm/Nu ); GRID_ASSERT( b < Nm/Nu );
// GridCartesian *grid = evec[0]._grid; // GridCartesian *grid = evec[0]._grid;
// converts block index to full indicies for an interval [L,R) // converts block index to full indicies for an interval [L,R)
@@ -796,7 +796,7 @@ private:
Glog << "Using split grid"<< std::endl; Glog << "Using split grid"<< std::endl;
// LatticeGaugeField s_Umu(SGrid); // LatticeGaugeField s_Umu(SGrid);
assert((Nu%mrhs)==0); GRID_ASSERT((Nu%mrhs)==0);
std::vector<Field> in(mrhs,f_grid); std::vector<Field> in(mrhs,f_grid);
Field s_in(sf_grid); Field s_in(sf_grid);
@@ -906,7 +906,7 @@ if(split_test){
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl; // Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
assert (!isnan(norm2(w[u]))); GRID_ASSERT (!isnan(norm2(w[u])));
for (int k=L+u; k<R; ++k) { for (int k=L+u; k<R; ++k) {
Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
} }
@@ -929,8 +929,8 @@ if(split_test){
Eigen::MatrixXcd & Qt, // Nm x Nm Eigen::MatrixXcd & Qt, // Nm x Nm
GridBase *grid) GridBase *grid)
{ {
assert( Nk%Nu == 0 && Nm%Nu == 0 ); GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); GRID_ASSERT( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -970,8 +970,8 @@ if(split_test){
GridBase *grid) GridBase *grid)
{ {
Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl; Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
assert( Nk%Nu == 0 && Nm%Nu == 0 ); GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); GRID_ASSERT( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -1119,7 +1119,7 @@ if (1){
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#endif #endif
} else { } else {
assert(0); GRID_ASSERT(0);
} }
} }
@@ -1131,8 +1131,8 @@ if (1){
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
//Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 ); GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); GRID_ASSERT( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk); M = Eigen::MatrixXcd::Zero(Nk,Nk);
// rearrange // rearrange
@@ -1159,8 +1159,8 @@ if (1){
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
//Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 ); GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); GRID_ASSERT( Nk <= Nm );
// rearrange // rearrange
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -121,7 +121,7 @@ public:
eresid(_eresid), MaxIter(_MaxIter), eresid(_eresid), MaxIter(_MaxIter),
diagonalisation(_diagonalisation), diagonalisation(_diagonalisation),
Nevec_acc(_Nu) Nevec_acc(_Nu)
{ assert( (Nk%Nu==0) && (Nm%Nu==0) ); }; { GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
//////////////////////////////// ////////////////////////////////
// Helpers // Helpers
@@ -151,7 +151,7 @@ public:
Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl; Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
} }
} }
assert(normalize(w,if_print) != 0); GRID_ASSERT(normalize(w,if_print) != 0);
} }
void reorthogonalize(Field& w, std::vector<Field>& evec, int k) void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
{ {
@@ -169,7 +169,7 @@ public:
w[i] = w[i] - ip * evec[j]; w[i] = w[i] - ip * evec[j];
}} }}
for(int i=0; i<_Nu; ++i) for(int i=0; i<_Nu; ++i)
assert(normalize(w[i],if_print) !=0); GRID_ASSERT(normalize(w[i],if_print) !=0);
} }
void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu) void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu)
@@ -205,8 +205,8 @@ public:
{ {
std::string fname = std::string(cname+"::calc_irbl()"); std::string fname = std::string(cname+"::calc_irbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
assert(grid == src[0].Grid()); GRID_ASSERT(grid == src[0].Grid());
assert( Nu = src.size() ); GRID_ASSERT( Nu = src.size() );
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
@@ -227,7 +227,7 @@ public:
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
assert(Nm == evec.size() && Nm == eval.size()); GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -279,16 +279,16 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm); Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid); diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
_sort.push(eval2,Nm); _sort.push(eval2,Nm);
Glog << "#Ritz value before shift: "<< std::endl; // Glog << "#Ritz value before shift: "<< std::endl;
for(int i=0; i<Nm; ++i){ for(int i=0; i<Nm; ++i){
std::cout.precision(13); // std::cout.precision(13);
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; // std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; // std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
} }
//---------------------------------------------------------------------- //----------------------------------------------------------------------
if ( Nm>Nk ) { if ( Nm>Nk ) {
Glog <<" #Apply shifted QR transformations "<<std::endl; // Glog <<" #Apply shifted QR transformations "<<std::endl;
//int k2 = Nk+Nu; //int k2 = Nk+Nu;
int k2 = Nk; int k2 = Nk;
@@ -326,7 +326,7 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm); Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid); diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
_sort.push(eval2,Nk); _sort.push(eval2,Nk);
Glog << "#Ritz value after shift: "<< std::endl; // Glog << "#Ritz value after shift: "<< std::endl;
for(int i=0; i<Nk; ++i){ for(int i=0; i<Nk; ++i){
// std::cout.precision(13); // std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; // std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
@@ -413,8 +413,8 @@ public:
{ {
std::string fname = std::string(cname+"::calc_rbl()"); std::string fname = std::string(cname+"::calc_rbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
assert(grid == src[0].Grid()); GRID_ASSERT(grid == src[0].Grid());
assert( Nu = src.size() ); GRID_ASSERT( Nu = src.size() );
int Np = (Nm-Nk); int Np = (Nm-Nk);
if (Np > 0 && MaxIter > 1) Np /= MaxIter; if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -441,7 +441,7 @@ public:
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
assert(Nm == evec.size() && Nm == eval.size()); GRID_ASSERT(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -622,7 +622,7 @@ private:
int Nu = w.size(); int Nu = w.size();
int Nm = evec.size(); int Nm = evec.size();
assert( b < Nm/Nu ); GRID_ASSERT( b < Nm/Nu );
// converts block index to full indicies for an interval [L,R) // converts block index to full indicies for an interval [L,R)
int L = Nu*b; int L = Nu*b;
@@ -630,7 +630,7 @@ private:
Real beta; Real beta;
assert((Nu%mrhs)==0); GRID_ASSERT((Nu%mrhs)==0);
std::vector<Field> in(mrhs,f_grid); std::vector<Field> in(mrhs,f_grid);
std::vector<Field> out(mrhs,f_grid); std::vector<Field> out(mrhs,f_grid);
@@ -644,7 +644,7 @@ private:
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl; // for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
k_start +=mrhs; k_start +=mrhs;
} }
Glog << "LinAlg "<< std::endl; // Glog << "LinAlg "<< std::endl;
if (b>0) { if (b>0) {
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
@@ -678,7 +678,7 @@ private:
} }
w_copy[u] = w[u]; w_copy[u] = w[u];
} }
Glog << "LinAlg done"<< std::endl; // Glog << "LinAlg done"<< std::endl;
// In block version, the steps 6 and 7 in Lanczos construction is // In block version, the steps 6 and 7 in Lanczos construction is
// replaced by the QR decomposition of new basis block. // replaced by the QR decomposition of new basis block.
@@ -691,15 +691,15 @@ private:
} }
// re-orthogonalization for numerical stability // re-orthogonalization for numerical stability
Glog << "Gram Schmidt"<< std::endl; // Glog << "Gram Schmidt"<< std::endl;
orthogonalize(w,Nu,evec,R); orthogonalize(w,Nu,evec,R);
// QR part // QR part
for (int u=1; u<Nu; ++u) { for (int u=1; u<Nu; ++u) {
orthogonalize(w[u],w,u); orthogonalize(w[u],w,u);
} }
Glog << "Gram Schmidt done "<< std::endl; // Glog << "Gram Schmidt done "<< std::endl;
Glog << "LinAlg "<< std::endl; // Glog << "LinAlg "<< std::endl;
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
//for (int v=0; v<Nu; ++v) { //for (int v=0; v<Nu; ++v) {
for (int v=u; v<Nu; ++v) { for (int v=u; v<Nu; ++v) {
@@ -711,12 +711,12 @@ private:
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl; // Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
assert (!isnan(norm2(w[u]))); GRID_ASSERT (!isnan(norm2(w[u])));
for (int k=L+u; k<R; ++k) { for (int k=L+u; k<R; ++k) {
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; // Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
} }
} }
Glog << "LinAlg done "<< std::endl; // Glog << "LinAlg done "<< std::endl;
if (b < Nm/Nu-1) { if (b < Nm/Nu-1) {
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
@@ -734,8 +734,8 @@ private:
Eigen::MatrixXcd & Qt, // Nm x Nm Eigen::MatrixXcd & Qt, // Nm x Nm
GridBase *grid) GridBase *grid)
{ {
assert( Nk%Nu == 0 && Nm%Nu == 0 ); GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); GRID_ASSERT( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -775,8 +775,8 @@ private:
GridBase *grid) GridBase *grid)
{ {
Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl; Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
assert( Nk%Nu == 0 && Nm%Nu == 0 ); GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); GRID_ASSERT( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -924,7 +924,7 @@ if (1){
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#endif #endif
} else { } else {
assert(0); GRID_ASSERT(0);
} }
} }
@@ -935,9 +935,9 @@ if (1){
int Nu, int Nb, int Nk, int Nm, int Nu, int Nb, int Nk, int Nm,
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; // Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 ); GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); GRID_ASSERT( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk); M = Eigen::MatrixXcd::Zero(Nk,Nk);
// rearrange // rearrange
@@ -953,7 +953,7 @@ if (1){
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu]; M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
} }
} }
Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; // Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
} }
@@ -963,9 +963,9 @@ if (1){
int Nu, int Nb, int Nk, int Nm, int Nu, int Nb, int Nk, int Nm,
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; // Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
assert( Nk%Nu == 0 && Nm%Nu == 0 ); GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 );
assert( Nk <= Nm ); GRID_ASSERT( Nk <= Nm );
// rearrange // rearrange
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -979,7 +979,7 @@ if (1){
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu); lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
} }
} }
Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; // Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
} }
@@ -988,7 +988,7 @@ if (1){
RealD Dsh, RealD Dsh,
Eigen::MatrixXcd& Qprod) Eigen::MatrixXcd& Qprod)
{ {
Glog << "shiftedQRDecompEigen() begin" << '\n'; // Glog << "shiftedQRDecompEigen() begin" << '\n';
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1004,7 +1004,7 @@ if (1){
// lower triangular part used to represent series // lower triangular part used to represent series
// of Q sequence. // of Q sequence.
Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n'; // Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
// equivalent operation of Qprod *= Q // equivalent operation of Qprod *= Q
//M = Eigen::MatrixXcd::Zero(Nm,Nm); //M = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1025,7 +1025,7 @@ if (1){
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; // Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
for (int j=0; j<Nm-(Nu+1); ++j) { for (int j=0; j<Nm-(Nu+1); ++j) {
for (int k=0; k<Nu+1+j; ++k) { for (int k=0; k<Nu+1+j; ++k) {
@@ -1033,7 +1033,7 @@ if (1){
} }
} }
} }
Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; // Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
for (int j=Nm-(Nu+1); j<Nm; ++j) { for (int j=Nm-(Nu+1); j<Nm; ++j) {
for (int k=0; k<Nm; ++k) { for (int k=0; k<Nm; ++k) {
@@ -1041,7 +1041,7 @@ if (1){
} }
} }
} }
Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; // Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
//static int ntimes = 2; //static int ntimes = 2;
//for (int j=0; j<Nm-(ntimes*Nu); ++j) { //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@@ -1067,13 +1067,13 @@ if (1){
Mtmp(j,i) = conj(Mtmp(i,j)); Mtmp(j,i) = conj(Mtmp(i,j));
} }
} }
Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; // Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh; Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
} }
Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; // Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
M = Mtmp; M = Mtmp;
//M = Q.adjoint()*(M*Q); //M = Q.adjoint()*(M*Q);
@@ -1085,7 +1085,7 @@ if (1){
// } // }
//} //}
Glog << "shiftedQRDecompEigen() end" <<std::endl; // Glog << "shiftedQRDecompEigen() end" <<std::endl;
} }
void exampleQRDecompEigen(void) void exampleQRDecompEigen(void)
@@ -53,6 +53,18 @@ enum IRLdiagonalisation {
IRLdiagonaliseWithEigen IRLdiagonaliseWithEigen
}; };
// Selector for how ImplicitlyRestartedLanczos orders eigenvalues.
// It is stored in the public member ImplicitlyRestartedLanczos::EigSort, which
// both constructors initialise to IRLeigsortMax.
// NOTE(review): the only code that would branch on this value is currently
// commented out in calc(), so the precise meaning of each enumerator
// (presumably "largest first" vs. "smallest square first") should be
// confirmed before relying on it.
enum IRLeigsort {
IRLeigsortMax,
IRLeigsortSqMin
};
#if 0
// Strict-weak-ordering predicate comparing two reals by their squares:
// true exactly when a^2 < b^2. (Currently compiled out by the enclosing #if 0.)
bool square_comp(RealD a, RealD b){
  return (a*a) < (b*b);
}
#endif
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field> template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{ {
public: public:
@@ -119,8 +131,9 @@ class ImplicitlyRestartedLanczos {
///////////////////////// /////////////////////////
// Constructor // Constructor
///////////////////////// /////////////////////////
public: public:
IRLeigsort EigSort;
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// PAB: // PAB:
@@ -154,6 +167,7 @@ public:
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp), eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart), MaxIter(_MaxIter) , MinRestart(_MinRestart),
EigSort(IRLeigsortMax),
orth_period(_orth_period), diagonalisation(_diagonalisation) { }; orth_period(_orth_period), diagonalisation(_diagonalisation) { };
ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp, ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
@@ -170,6 +184,7 @@ public:
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp), eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart), MaxIter(_MaxIter) , MinRestart(_MinRestart),
EigSort(IRLeigsortMax),
orth_period(_orth_period), diagonalisation(_diagonalisation) { }; orth_period(_orth_period), diagonalisation(_diagonalisation) { };
//////////////////////////////// ////////////////////////////////
@@ -211,7 +226,7 @@ until convergence
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false) void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{ {
GridBase *grid = src.Grid(); GridBase *grid = src.Grid();
assert(grid == evec[0].Grid()); GRID_ASSERT(grid == evec[0].Grid());
// GridLogIRL.TimingMode(1); // GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -231,7 +246,7 @@ until convergence
} }
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
assert(Nm <= evec.size() && Nm <= eval.size()); GRID_ASSERT(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;
@@ -245,9 +260,10 @@ until convergence
_HermOp(src_n,tmp); _HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0); // std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl; // std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. // RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001) if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_; i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na; evalMaxApprox = na;
@@ -255,6 +271,7 @@ until convergence
src_n = tmp; src_n = tmp;
} }
} }
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm); std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm); std::vector<RealD> lme2(Nm);
@@ -314,8 +331,12 @@ until convergence
// sorting // sorting
////////////////////////////////// //////////////////////////////////
eval2_copy = eval2; eval2_copy = eval2;
// if (EigSort==IRLeigsortMax)
// std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),square_comp);
// else
std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>()); std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
std::cout<<GridLogIRL <<" evals sorted "<<std::endl; std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
// eval2_copy = eval2;
const int chunk=8; const int chunk=8;
for(int io=0; io<k2;io+=chunk){ for(int io=0; io<k2;io+=chunk){
std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ; std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
@@ -331,11 +352,12 @@ until convergence
////////////////////////////////// //////////////////////////////////
Qt = Eigen::MatrixXd::Identity(Nm,Nm); Qt = Eigen::MatrixXd::Identity(Nm,Nm);
for(int ip=k2; ip<Nm; ++ip){ for(int ip=k2; ip<Nm; ++ip){
// std::cout<<GridLogIRL <<"QR decompose "<<eval2[ip]<<std::endl;
QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm); QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
} }
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl; std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
assert(k2<Nm); assert(k2<Nm); assert(k1>0); GRID_ASSERT(k2<Nm); GRID_ASSERT(k2<Nm); GRID_ASSERT(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl; std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
@@ -373,7 +395,8 @@ until convergence
// power of two search pattern; not every evalue in eval2 is assessed. // power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1; int allconv =1;
for(int jj = 1; jj<=Nstop; jj*=2){ // for(int jj = 1; jj<=Nstop; jj*=2){
for(int jj = 1; jj<=Nstop; jj++){
int j = Nstop-jj; int j = Nstop-jj;
RealD e = eval2_copy[j]; // Discard the evalue RealD e = eval2_copy[j]; // Discard the evalue
basisRotateJ(B,evec,Qt,j,0,Nk,Nm); basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
@@ -461,7 +484,7 @@ until convergence
{ {
std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl; std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20; const RealD tiny = 1.0e-20;
assert( k< Nm ); GRID_ASSERT( k< Nm );
GridStopWatch gsw_op,gsw_o; GridStopWatch gsw_op,gsw_o;
@@ -595,7 +618,7 @@ until convergence
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) { } else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid); diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
} else { } else {
assert(0); GRID_ASSERT(0);
} }
} }
@@ -685,7 +708,7 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
} }
} }
#else #else
assert(0); GRID_ASSERT(0);
#endif #endif
} }
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,276 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./Grid/algorithms/iterative/LanczosBidiagonalization.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LANCZOS_BIDIAGONALIZATION_H
#define GRID_LANCZOS_BIDIAGONALIZATION_H
NAMESPACE_BEGIN(Grid);
/**
* Lanczos Bidiagonalization (Golub-Kahan)
*
* For a linear operator A with adjoint A^dag, constructs the bidiagonal
* decomposition:
*
* A V_m = U_m B_m
* A^dag U_m = V_m B_m^T + beta_{m+1} v_{m+1} e_m^T
*
* where:
* V_m = [v_1, ..., v_m] right Lanczos vectors (orthonormal)
* U_m = [u_1, ..., u_m] left Lanczos vectors (orthonormal)
* B_m is upper bidiagonal with diag(alpha_1,...,alpha_m) and
* superdiag(beta_2,...,beta_m)
*
* The singular values of A are approximated by those of B_m.
* The singular values of B_m are the square roots of the eigenvalues of
* the symmetric tridiagonal matrix B_m^T B_m.
*
* Usage:
* LanczosBidiagonalization<Field> lb(Linop, grid);
* lb.run(src, Nm, tol);
* // Access results via getters.
*/
template <class Field>
class LanczosBidiagonalization {
public:
LinearOperatorBase<Field> &Linop;
GridBase *Grid;
int Nm; // number of Lanczos steps taken
RealD Tolerance; // convergence threshold on beta_{k+1} / alpha_k
std::vector<Field> V; // right Lanczos vectors v_1 ... v_m
std::vector<Field> U; // left Lanczos vectors u_1 ... u_m
std::vector<RealD> alpha; // diagonal of bidiagonal matrix
std::vector<RealD> beta; // super-diagonal (beta[k] couples u_k and v_{k+1})
// SVD of the bidiagonal matrix (filled after computeSVD())
Eigen::VectorXd singularValues;
Eigen::MatrixXd leftSVecs; // columns are left singular vectors of B
Eigen::MatrixXd rightSVecs; // columns are right singular vectors of B
public:
LanczosBidiagonalization(LinearOperatorBase<Field> &_Linop, GridBase *_Grid,
RealD _tol = 1.0e-8)
: Linop(_Linop), Grid(_Grid), Tolerance(_tol), Nm(0)
{}
/**
* Run the Golub-Kahan Lanczos bidiagonalization.
*
* Parameters
* ----------
* src : starting vector (need not be normalised)
* Nmax : maximum number of Lanczos steps
* reorth : if true, full reorthogonalisation of both V and U bases
*/
void run(const Field &src, int Nmax, bool reorth = true)
{
assert(norm2(src) > 0.0);
V.clear(); U.clear();
alpha.clear(); beta.clear();
Nm = 0;
Field p(Grid), r(Grid);
// --- initialise: v_1 = src / ||src|| ---
Field v(Grid);
v = src;
RealD nrm = std::sqrt(norm2(v));
v = (1.0 / nrm) * v;
V.push_back(v);
for (int k = 0; k < Nmax; ++k) {
// p = A v_k
Linop.Op(V[k], p);
// p = p - beta_k * u_{k-1} (remove previous left vector)
if (k > 0) {
p = p - beta[k-1] * U[k-1];
}
// alpha_k = ||p||
RealD ak = std::sqrt(norm2(p));
if (ak < 1.0e-14) {
std::cout << GridLogMessage
<< "LanczosBidiagonalization: lucky breakdown at step "
<< k << " (alpha = " << ak << ")" << std::endl;
break;
}
alpha.push_back(ak);
// u_k = p / alpha_k
Field u(Grid);
u = (1.0 / ak) * p;
// full reorthogonalisation of u against previous U
if (reorth) {
for (int j = 0; j < (int)U.size(); ++j) {
ComplexD ip = innerProduct(U[j], u);
u = u - ip * U[j];
}
RealD unrm = std::sqrt(norm2(u));
if (unrm > 1.0e-14) u = (1.0 / unrm) * u;
}
U.push_back(u);
// r = A^dag u_k - alpha_k * v_k
Linop.AdjOp(U[k], r);
r = r - ak * V[k];
// full reorthogonalisation of r against previous V
if (reorth) {
for (int j = 0; j < (int)V.size(); ++j) {
ComplexD ip = innerProduct(V[j], r);
r = r - ip * V[j];
}
}
// beta_{k+1} = ||r||
RealD bk = std::sqrt(norm2(r));
beta.push_back(bk);
Nm = k + 1;
std::cout << GridLogMessage
<< "LanczosBidiagonalization step " << k
<< " alpha = " << ak
<< " beta = " << bk << std::endl;
// convergence: residual beta / alpha small enough
if (bk / ak < Tolerance) {
std::cout << GridLogMessage
<< "LanczosBidiagonalization converged at step " << k
<< " (beta/alpha = " << bk / ak << ")" << std::endl;
break;
}
if (k == Nmax - 1) break; // no v_{k+2} needed after last step
// v_{k+1} = r / beta_{k+1}
Field vnext(Grid);
vnext = (1.0 / bk) * r;
V.push_back(vnext);
}
}
/**
* Compute the SVD of the bidiagonal matrix B using Eigen.
* Singular values are stored in descending order.
*/
void computeSVD()
{
int m = Nm;
Eigen::MatrixXd B = Eigen::MatrixXd::Zero(m, m);
for (int k = 0; k < m; ++k) {
B(k, k) = alpha[k];
if (k + 1 < m && k < (int)beta.size())
B(k, k+1) = beta[k];
}
Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
Eigen::ComputeThinU | Eigen::ComputeThinV);
singularValues = svd.singularValues(); // already sorted descending
leftSVecs = svd.matrixU();
rightSVecs = svd.matrixV();
std::cout << GridLogMessage
<< "LanczosBidiagonalization: singular values of B_" << m
<< std::endl;
for (int k = 0; k < m; ++k)
std::cout << GridLogMessage << " sigma[" << k << "] = "
<< singularValues(k) << std::endl;
}
/**
* Return the k-th approximate left singular vector of A in the full
* lattice space. computeSVD() must have been called first.
*/
Field leftSingularVector(int k)
{
assert(k < (int)leftSVecs.cols());
Field svec(Grid);
svec = Zero();
for (int j = 0; j < Nm; ++j)
svec = svec + leftSVecs(j, k) * U[j];
return svec;
}
/**
* Return the k-th approximate right singular vector of A in the full
* lattice space. computeSVD() must have been called first.
*/
Field rightSingularVector(int k)
{
assert(k < (int)rightSVecs.cols());
Field svec(Grid);
svec = Zero();
for (int j = 0; j < Nm; ++j)
svec = svec + rightSVecs(j, k) * V[j];
return svec;
}
/**
* Verify the bidiagonalization: returns max residual
* max_k || A v_k - alpha_k u_k - beta_k u_{k-1} ||
*/
RealD verify()
{
Field tmp(Grid);
RealD maxres = 0.0;
for (int k = 0; k < Nm; ++k) {
Linop.Op(V[k], tmp);
tmp = tmp - alpha[k] * U[k];
if (k > 0 && k-1 < (int)beta.size())
tmp = tmp - beta[k-1] * U[k-1];
RealD res = std::sqrt(norm2(tmp));
if (res > maxres) maxres = res;
std::cout << GridLogMessage
<< "LanczosBidiagonalization verify step " << k
<< " ||A v_k - alpha_k u_k - beta_{k-1} u_{k-1}|| = "
<< res << std::endl;
}
return maxres;
}
/* Getters */
int getNm() const { return Nm; }
const std::vector<Field>& getV() const { return V; }
const std::vector<Field>& getU() const { return U; }
const std::vector<RealD>& getAlpha() const { return alpha; }
const std::vector<RealD>& getBeta() const { return beta; }
Eigen::VectorXd getSingularValues() const { return singularValues; }
};
NAMESPACE_END(Grid);
#endif
@@ -80,7 +80,7 @@ public:
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace) _Linop(linop), subspace(_subspace)
{ {
assert(subspace.size() >0); GRID_ASSERT(subspace.size() >0);
}; };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
@@ -346,12 +346,12 @@ public:
void testFine(RealD resid) void testFine(RealD resid)
{ {
assert(evals_fine.size() == nbasis); GRID_ASSERT(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis); GRID_ASSERT(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op); ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){ for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1); GRID_ASSERT(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
} }
} }
@@ -359,8 +359,8 @@ public:
//hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{ {
assert(evals_fine.size() == nbasis); GRID_ASSERT(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis); GRID_ASSERT(subspace.size() == nbasis);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -380,7 +380,7 @@ public:
void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes) RealD MaxIt, RealD betastp, int MinRes)
{ {
assert(nbasis<=Nm); GRID_ASSERT(nbasis<=Nm);
Chebyshev<FineField> Cheby(cheby_parms); Chebyshev<FineField> Cheby(cheby_parms);
FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp); FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
@@ -400,8 +400,8 @@ public:
IRL.calc(evals_fine,subspace,src,Nconv,false); IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved // Shrink down to number saved
assert(Nstop>=nbasis); GRID_ASSERT(Nstop>=nbasis);
assert(Nconv>=nbasis); GRID_ASSERT(Nconv>=nbasis);
evals_fine.resize(nbasis); evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid); subspace.resize(nbasis,_FineGrid);
} }
@@ -433,7 +433,7 @@ public:
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0; int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
assert(Nconv>=Nstop); GRID_ASSERT(Nconv>=Nstop);
evals_coarse.resize(Nstop); evals_coarse.resize(Nstop);
evec_coarse.resize (Nstop,_CoarseGrid); evec_coarse.resize (Nstop,_CoarseGrid);
for (int i=0;i<Nstop;i++){ for (int i=0;i<Nstop;i++){
+4 -4
View File
@@ -35,7 +35,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the MR fails to converge. bool ErrorOnNoConverge; // throw an GRID_ASSERT when the MR fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -59,7 +59,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
// Initial residual computation & set up // Initial residual computation & set up
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); GRID_ASSERT(std::isnan(guess) == 0);
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
@@ -136,7 +136,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl; std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
assert(true_residual / Tolerance < 10000.0); GRID_ASSERT(true_residual / Tolerance < 10000.0);
IterationsToComplete = k; IterationsToComplete = k;
@@ -148,7 +148,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
<< std::endl; << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
assert(0); GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
@@ -37,7 +37,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
using OperatorFunction<FieldD>::operator(); using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge, bool ErrorOnNoConverge; // Throw an GRID_ASSERT when MPFGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -91,7 +91,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); GRID_ASSERT(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -150,7 +150,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl; std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
assert(0); GRID_ASSERT(0);
} }
RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
@@ -197,7 +197,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
} }
} }
assert(0); // Never reached GRID_ASSERT(0); // Never reached
return cp; return cp;
} }
@@ -60,6 +60,32 @@ public:
} }
}; };
/////////////////////////////////////////////////////
// Solve M out = in through the normal equations
//   (M Mdag) res = in ,  out = Mdag res
// using a caller-supplied solver for the Hermitian operator M Mdag and a
// caller-supplied initial guess for res.
/////////////////////////////////////////////////////
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
  SparseMatrixBase<Field> & _Matrix;          // the matrix M
  OperatorFunction<Field> & _HermitianSolver; // solver applied to M Mdag
  LinearFunction<Field>   & _Guess;           // produces the starting guess from the source

public:

  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  // All three arguments are held by reference and must outlive this object.
  /////////////////////////////////////////////////////
  NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
                 LinearFunction<Field> &Guess)
    : _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};

  // in  : right-hand side
  // out : receives Mdag res, where res is the solver's answer to M Mdag res = in
  void operator() (const Field &in, Field &out){

    // Only one work field is needed; the former unused temporary (a full
    // lattice allocation per call) has been dropped.
    Field res(in.Grid());

    MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
    _Guess(in,res);                    // seed the solver
    _HermitianSolver(MMdagOp,in,res);  // M Mdag res = in ;
    _Matrix.Mdag(res,out);             // out = Mdag res
  }
};
template<class Field> class HPDSolver : public LinearFunction<Field> { template<class Field> class HPDSolver : public LinearFunction<Field> {
private: private:
LinearOperatorBase<Field> & _Matrix; LinearOperatorBase<Field> & _Matrix;
+8 -9
View File
@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;
auto src_n = src; auto src_n = src;
auto tmp = src; auto tmp = src;
const int _MAX_ITER_EST_ = 100; const int _MAX_ITER_EST_ = 200;
for (int i=0;i<_MAX_ITER_EST_;i++) { for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -30,18 +30,17 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl; std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { // if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na; // evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; // return evalMaxApprox;
return evalMaxApprox; // }
}
evalMaxApprox = na; evalMaxApprox = na;
src_n = tmp; src_n = tmp;
} }
assert(0); std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return 0; return evalMaxApprox;
} }
}; };
} }
+76
View File
@@ -0,0 +1,76 @@
#pragma once
namespace Grid {
// Indicator function of the open interval (lo,hi): evaluates to 1.0 for
// lo < x < hi and to 0.0 everywhere else (end points included).
class Band
{
  RealD lo, hi;
public:
  Band(RealD _lo,RealD _hi) : lo(_lo), hi(_hi) {}

  RealD operator() (RealD x){
    const bool inside = (x>lo) && (x<hi);
    return inside ? 1.0 : 0.0;
  }
};
// Reports how the squared norm of a source vector is distributed over
// consecutive spectral bands of an operator. For each band a Jackson-smoothed
// Chebyshev approximation of the band's indicator function is applied to the
// source and the squared norm of the result is logged.
class PowerSpectrum
{
public:

  // Rescale v in place by 1/||v|| and return the norm it had on entry.
  template<typename T> static RealD normalise(T& v)
  {
    RealD nn = norm2(v);
    nn = sqrt(nn);
    v = v * (1.0/nn);
    return nn;
  }

  std::vector<RealD> ranges;   // upper edge of each band; the first band starts at 0
  std::vector<int>   order;    // Chebyshev order used for each band

  // bins   : band upper edges; the last entry is also used as the upper end
  //          of the Chebyshev approximation interval
  // _order : polynomial order per band; operator() indexes it with every
  //          band index, so it needs at least bins.size() entries
  // Both vectors are copied, so temporaries may be passed.
  PowerSpectrum( const std::vector<RealD> &bins, const std::vector<int> &_order ) : ranges(bins), order(_order) { }

  // Logs the fraction of norm2(src) found in each band, followed by the sum
  // over bands and norm2(src) itself. The return value is always 0; results
  // are only reported through the log.
  template<class Field>
  RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
  {
    int N=ranges.size();
    // With no bands there is nothing to index; avoid reading ranges[-1].
    RealD hi = (N>0) ? ranges[N-1] : 0.0;

    RealD lo_band = 0.0;
    RealD hi_band;
    RealD nn=norm2(src);
    RealD ss=0.0;

    Field tmp = src;

    for(int b=0;b<N;b++){
      hi_band = ranges[b];
      Band Notch(lo_band,hi_band);

      Chebyshev<Field> polynomial;
      polynomial.Init(0.0,hi,order[b],Notch);
      polynomial.JacksonSmooth();

      polynomial(HermOp,src,tmp) ;

      // One reduction per band: reuse p for both the running sum and the log.
      RealD p=norm2(tmp);
      ss=ss+p;
      std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<p/nn<<std::endl;

      lo_band=hi_band;
    }
    std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
    std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;

    return 0;
  };
};
}
@@ -112,7 +112,7 @@ public:
} }
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl; std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0); GRID_ASSERT(0);
} }
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
@@ -118,7 +118,7 @@ public:
} }
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0); // GRID_ASSERT(0);
} }
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -221,7 +221,7 @@ public:
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){ for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0); int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back]; p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -231,7 +231,7 @@ public:
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
assert(0); // never reached GRID_ASSERT(0); // never reached
return cp; return cp;
} }
}; };
@@ -66,15 +66,26 @@ public:
Linop(_Linop), Linop(_Linop),
Preconditioner(Prec), Preconditioner(Prec),
mmax(_mmax), mmax(_mmax),
nstep(_nstep) nstep(_nstep) // what is nstep vs mmax? one is the number of inner iterations
{ {
level=1; level=1;
verbose=1; verbose=1;
}; };
// virtual method stubs for updating GCR polynomial
// Overridable hook invoked through this->LogBegin() from GCRnStep before the
// iteration starts. This default implementation only prints a marker line to
// std::cout.
virtual void LogBegin(void){
std::cout << "GCR::LogBegin() "<<std::endl;
};
// Overridable per-iteration hook. The arguments are ignored by this default
// implementation, which only prints a marker line to std::cout.
// NOTE(review): presumably k is the iteration index, a the step coefficient
// and betas the orthogonalisation coefficients of that step -- confirm
// against the call site.
virtual void LogIteration(int k, ComplexD a, std::vector<ComplexD> betas){
std::cout << "GCR::LogIteration() "<<std::endl;
};
// Overridable completion hook. The arguments are ignored by this default
// implementation, which only prints a marker line to std::cout.
// NOTE(review): presumably receives the coefficients accumulated over the
// whole solve -- confirm against the call site.
virtual void LogComplete(std::vector<ComplexD>& alphas, std::vector<std::vector<ComplexD>>& betas) {
std::cout << "GCR::LogComplete() "<<std::endl;
};
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi=Zero(); // psi=Zero();
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
ssq=norm2(src); ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq; rsq=Tolerance*Tolerance*ssq;
@@ -96,7 +107,6 @@ public:
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl; GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) { if(cp<rsq) {
SolverTimer.Stop(); SolverTimer.Stop();
Linop.Op(psi,r); Linop.Op(psi,r);
@@ -113,7 +123,7 @@ public:
} }
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0); // GRID_ASSERT(0);
} }
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -135,9 +145,9 @@ public:
//////////////////////////////// ////////////////////////////////
// history for flexible orthog // history for flexible orthog
//////////////////////////////// ////////////////////////////////
std::vector<Field> q(mmax,grid); std::vector<Field> q(mmax,grid); // q = Ap
std::vector<Field> p(mmax,grid); std::vector<Field> p(mmax,grid); // store mmax conjugate momenta
std::vector<RealD> qq(mmax); std::vector<RealD> qq(mmax); // qq = (Ap)^2 = <p|A^\dagger A |p> (denom of \alpha)
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl; GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
@@ -157,6 +167,8 @@ public:
LinalgTimer.Stop(); LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<< norm2(r) <<std::endl; GCRLogLevel<< "PGCR true residual r = src - A psi "<< norm2(r) <<std::endl;
this->LogBegin(); // initialize polynomial GCR if needed (TODO think about placement of this)
///////////////////// /////////////////////
// p = Prec(r) // p = Prec(r)
///////////////////// /////////////////////
@@ -179,31 +191,44 @@ public:
q[0]= Az; q[0]= Az;
qq[0]= zAAz; qq[0]= zAAz;
std::cout << "||init p - src||: " << norm2(p[0] - src) << std::endl; // for debugging
cp =norm2(r); cp =norm2(r);
LinalgTimer.Stop(); LinalgTimer.Stop();
std::vector<ComplexD> all_alphas;
std::vector<std::vector<ComplexD>> all_betas;
for(int k=0;k<nstep;k++){ for(int k=0;k<nstep;k++){
steps++; steps++;
int kp = k+1; int kp = k+1;
int peri_k = k %mmax; int peri_k = k %mmax; // only store mmax vectors; just roll around if needed
int peri_kp= kp%mmax; int peri_kp= kp%mmax;
// std::cout << "peri_kp = " << peri_kp << std::endl;
LinalgTimer.Start(); LinalgTimer.Start();
rq= innerProduct(q[peri_k],r); // what if rAr not real? rq= innerProduct(q[peri_k],r); // what if rAr not real?
a = rq/qq[peri_k]; a = rq/qq[peri_k]; // compute alpha_j
axpy(psi,a,p[peri_k],psi); all_alphas.push_back(a);
cp = axpy_norm(r,-a,q[peri_k],r); axpy(psi,a,p[peri_k],psi); // update psi --> psi + \alpha p
cp = axpy_norm(r,-a,q[peri_k],r); // update r --> r - \alpha D p. Note q = Dp
LinalgTimer.Stop(); LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl; // LogIterationA(k + 1, a);
if((k==nstep-1)||(cp<rsq)){ GCRLogLevel<< "GCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
return cp;
} // moving this to end of loop so that it doesn't exit beforehand
// TODO if I want to uncomment this, I have to split the LogIteration again and put LogIterationA() beforehand
// if((k==nstep-1)||(cp<rsq)){
// return cp;
// }
PrecTimer.Start(); PrecTimer.Start();
@@ -221,22 +246,205 @@ public:
q[peri_kp]=Az; q[peri_kp]=Az;
p[peri_kp]=z; p[peri_kp]=z;
// Field Dsrc (grid);
// Linop.Op(src, Dsrc);
// std::cout << "||q[peri_kp] - D(src)||: " << norm2(q[peri_kp] - Dsrc) << std::endl; // for debugging
// // delete after testing
// std::cout << "Testing Dsq on one for GCR: " << std::endl;
// Field myField (grid);
// myField = 1.0;
// Field out1 (grid); Field out2 (grid);
// Linop.HermOp(myField, out1);
// Linop.Op(myField, out2);
// std::cout << "Dsq.Hermop(ones) has norm " << norm2(out1) << std::endl;
// std::cout << "Dsq.Op(ones) has norm " << norm2(out2) << std::endl;
// basically northog = k+1 if mmax is large
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
// std::cout << "northog: " << northog << std::endl;
std::vector<ComplexD> betas (northog);
// std::cout << "peri_kp: " << peri_kp << std::endl;
// we iterate backwards counting down from the current k+1 index (peri_kp) because we
for(int back=0;back<northog;back++){ for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0); int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; // b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
b=-(innerProduct(q[peri_back],Az))/qq[peri_back]; // TODO try complex beta
p[peri_kp]=p[peri_kp]+b*p[peri_back]; p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back]; q[peri_kp]=q[peri_kp]+b*q[peri_back];
// LogIterationB(peri_back, b);
// betas[back] = b; // may need to change the indexing if I ever do it with restarts
// std::cout << "[DEBUG] pushing beta for back = " << back << ", peri_back = " << peri_back << std::endl;
betas[peri_back] = b; // may need to change the indexing if I ever do it with restarts
} }
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop(); LinalgTimer.Stop();
// log iteration and update GCR polynomial if necessary.
all_betas.push_back(betas);
LogIteration(k + 1, a, betas);
// finish if necessary
if((k==nstep-1)||(cp<rsq)){
std::cout << "All alphas: " << std::endl << all_alphas << std::endl;
std::cout << "All betas: " << std::endl << all_betas << std::endl;
LogComplete(all_alphas, all_betas);
std::cout << "Exiting GCR." << std::endl;
return cp;
} }
assert(0); // never reached
}
GRID_ASSERT(0); // never reached
return cp; return cp;
} }
}; };
// Serializable container filled by PGCRPolynomial while a solve is logged:
//   data   - PGCRPolynomial::LogIteration appends one snapshot of its `polynomial`
//            coefficient vector per logged iteration;
//   betas  - PGCRPolynomial::LogComplete appends one vector of beta coefficients per iteration;
//   alphas - PGCRPolynomial::LogComplete appends one alpha coefficient per iteration.
// All coefficients are stored as std::complex<double>.
class PolynomialFile: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(PolynomialFile,
std::vector<std::vector<std::complex<double>>>, data,
std::vector<std::vector<std::complex<double>>>, betas,
std::vector<std::complex<double>>, alphas
);
};
// Optionally record the GCR polynomial. [PO]: TODO
template <class Field>
class PGCRPolynomial : public PrecGeneralisedConjugateResidualNonHermitian<Field> {
public:
  // Coefficients recorded during the solve (one entry per logged iteration).
  std::vector<ComplexD> ak;                    // alpha of each iteration
  std::vector<std::vector<ComplexD>> bk;       // bk[k][i] = coefficient of p_i in p_{k+1}

  // Polynomial bookkeeping: every vector holds coefficients of powers of A
  // (index n <-> A^n) applied to the initial search direction.
  std::vector<std::vector<ComplexD>> poly_p;   // poly_p[j] = polynomial of search direction p_j
  std::vector<ComplexD> poly_Ap;               // polynomial of A p_j (only kept for the latest p)
  std::vector<ComplexD> poly_r;                // polynomial of the residual
  std::vector<ComplexD> polynomial;            // polynomial of the solution psi

  PolynomialFile& PF;                          // output record, owned by the caller

public:
  PGCRPolynomial(RealD tol, Integer maxit,LinearOperatorBase<Field> &_Linop, LinearFunction<Field> &Prec, int _mmax, int _nstep, PolynomialFile& _PF)
    : PrecGeneralisedConjugateResidualNonHermitian<Field>(tol, maxit, _Linop, Prec, _mmax, _nstep), PF(_PF)
  {}

  // Apply the recorded polynomial in A = Linop to src:
  //   psi = sum_n polynomial[n] A^n src.
  // An empty polynomial (nothing logged yet) is treated as the zero polynomial
  // instead of reading polynomial[0] out of bounds.
  void PolyOp(const Field &src, Field &psi)
  {
    if (polynomial.empty()) {
      psi = Zero();
      return;
    }
    Field tmp(src.Grid());
    Field AtoN(src.Grid());
    AtoN = src;
    psi=AtoN*polynomial[0];
    for(std::size_t n=1;n<polynomial.size();n++){
      tmp = AtoN;
      this->Linop.Op(tmp,AtoN);        // iterate A^n
      psi = psi + polynomial[n]*AtoN;  // psi += poly_n A^n src
    }
  }

  // Replay the recorded (alpha, beta) recurrence on src:
  //   x_{k+1} = x_k + a_k p_k
  //   r_{k+1} = r_k - a_k A p_k
  //   p_{k+1} = r_{k+1} + sum_i bk[k][i] p_i
  // This mirrors the bookkeeping in LogIteration, which also starts p_{k+1}
  // from the residual (i.e. no preconditioner is applied in the replay).
  // NOTE(review): the index convention bk[k][i] is taken from LogIteration,
  // where betas[j] multiplies poly_p[j]; confirm against the solver if the
  // history is ever truncated (small mmax) or restarted.
  void PGCRsequence(const Field &src, Field &x)
  {
    Field Ap(src.Grid());
    Field r(src.Grid());

    std::vector<Field> p;
    p.reserve(ak.size() + 1);
    p.push_back(src);

    r=src;
    x=Zero();
    x.Checkerboard()=src.Checkerboard();

    for(std::size_t k=0;k<ak.size();k++){
      x = x + ak[k]*p[k];
      this->Linop.Op(p[k], Ap);
      r = r - ak[k] * Ap;

      Field p_next(src.Grid());
      p_next = r;
      if (k < bk.size()) {
        const std::vector<ComplexD> &beta_k = bk[k];
        // Only directions p_0..p_k exist at this point.
        for (std::size_t i = 0; i < beta_k.size() && i < p.size(); i++) {
          p_next = p_next + beta_k[i] * p[i];
        }
      }
      p.push_back(p_next);
    }
  }

  // Solve A psi = src from a zero initial guess using the base-class solver.
  void Solve(const Field &src, Field &psi)
  {
    psi=Zero();
    this->operator()(src, psi);
  }

  // Reset ALL recorded state so that every logged solve starts from
  //   p_0 = 1, r_0 = 1, A p = 0, psi = 0   (as polynomials in A).
  // Previously only ak/bk/polynomial were cleared, so a second call kept
  // appending to stale poly_p/poly_r/poly_Ap entries.
  virtual void LogBegin(void)
  {
    std::cout << "PGCR::LogBegin() "<<std::endl;
    ak.clear();
    bk.clear();
    polynomial.clear();
    poly_Ap.assign(1, ComplexD(0.0));                          // becomes (0, 1) in the first iteration
    poly_p.assign(1, std::vector<ComplexD>(1, ComplexD(1.0))); // p_0 = 1
    poly_r.assign(1, ComplexD(1.0));                           // r_0 = 1
  }

  // Record iteration k (1-based): store alpha and betas, then advance the
  // polynomial representations of psi, r and the next search direction.
  virtual void LogIteration(int k, ComplexD a, std::vector<ComplexD> betas){
    std::cout << "PGCR::LogIteration(k = " << k << ")" << std::endl;
    ak.push_back(a);
    bk.push_back(betas);

    // A p_{k-1}: multiplying by A shifts every coefficient one power up.
    poly_Ap.push_back(0.0);
    poly_Ap[0] = 0.0;
    for(int i = 0; i < k; i++){
      poly_Ap[i+1]=poly_p[k-1][i];
    }

    // psi <- psi + a p_{k-1}
    polynomial.push_back(0.0);
    for(int i = 0; i < k; i++) {
      polynomial[i] += a * poly_p[k-1][i];
    }
    {
      std::vector<std::complex<double>> poly_stdcmplx(polynomial.begin(), polynomial.end());
      PF.data.push_back(poly_stdcmplx);
    }

    // r <- r - a A p_{k-1};  p_k starts out as the new residual.
    poly_r.push_back(0.0);
    std::vector<ComplexD> p_next (k + 1, ComplexD(0.0));
    for(int i = 0; i < k + 1; i++){
      poly_r[i] = poly_r[i] - a * poly_Ap[i];
      p_next[i] = poly_r[i];
    }

    // p_k <- p_k + sum_j betas[j] p_j
    int nbeta = betas.size();
    std::cout << "Betas: " << betas << std::endl;
    for (int j = 0; j < nbeta; j++) {
      for (int i = 0; i < j+1; i++) {
        p_next[i] += betas[j] * poly_p[j][i];
      }
    }
    poly_p.push_back(p_next);
  }

  // Copy every recorded alpha (and the matching beta vector, when present)
  // into the PolynomialFile.
  virtual void LogComplete(std::vector<ComplexD>& alphas, std::vector<std::vector<ComplexD>>& betas) {
    std::cout << "PGCR::LogComplete() "<<std::endl;
    for (std::size_t i = 0; i < alphas.size(); i++) {
      PF.alphas.push_back(std::complex<double>(alphas[i].real(), alphas[i].imag()));
      if (i < betas.size()) {
        std::vector<std::complex<double>> beta_stdcmplx(betas[i].begin(), betas[i].end());
        PF.betas.push_back(beta_stdcmplx);
      }
    }
  }
};
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif
@@ -79,7 +79,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
LinOp.Op(x,r); r = b - r; LinOp.Op(x,r); r = b - r;
assert(normb> 0.0); GRID_ASSERT(normb> 0.0);
resid = norm2(r)/normb; resid = norm2(r)/normb;
if (resid <= Tolerance) { if (resid <= Tolerance) {
@@ -105,8 +105,8 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
for (int i = 1; i <= MaxIterations; i++) { for (int i = 1; i <= MaxIterations; i++) {
// Breakdown tests // Breakdown tests
assert( rho != 0.0); GRID_ASSERT( rho != 0.0);
assert( xi != 0.0); GRID_ASSERT( xi != 0.0);
v = (1. / rho) * v_tld; v = (1. / rho) * v_tld;
y = (1. / rho) * y; y = (1. / rho) * y;
@@ -134,10 +134,10 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
ep=Zep.real(); ep=Zep.real();
std::cout << "Zep "<<Zep <<std::endl; std::cout << "Zep "<<Zep <<std::endl;
// Complex Audit // Complex Audit
assert(abs(ep)>0); GRID_ASSERT(abs(ep)>0);
beta = ep / delta; beta = ep / delta;
assert(abs(beta)>0); GRID_ASSERT(abs(beta)>0);
v_tld = p_tld - beta * v; v_tld = p_tld - beta * v;
y = v_tld; y = v_tld;
@@ -158,7 +158,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
std::cout << "theta "<<theta<<std::endl; std::cout << "theta "<<theta<<std::endl;
std::cout << "gamma "<<gamma<<std::endl; std::cout << "gamma "<<gamma<<std::endl;
assert(abs(gamma)> 0.0); GRID_ASSERT(abs(gamma)> 0.0);
eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1); eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
@@ -178,7 +178,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
} }
std::cout << "Iteration "<<i<<" resid " << resid<<std::endl; std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
} }
assert(0); GRID_ASSERT(0);
return; // no convergence return; // no convergence
} }
#else #else
@@ -0,0 +1,753 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./Grid/algorithms/iterative/RestartedLanczosBidiagonalization.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_RESTARTED_LANCZOS_BIDIAGONALIZATION_H
#define GRID_RESTARTED_LANCZOS_BIDIAGONALIZATION_H
NAMESPACE_BEGIN(Grid);
/**
* Implicitly Restarted Lanczos Bidiagonalization (IRLBA)
*
* Computes the p largest (or p smallest) singular triplets of a linear
* operator A using the Golub-Kahan-Lanczos bidiagonalization with implicit
* restart via thick-restart / QR shifts.
*
* Algorithm (Baglama & Reichel, SIAM J. Sci. Comput. 27(1):19-42, 2005):
*
* Outer loop:
* 1. Extend the p-step (or seed) bidiagonalization to k steps:
* A V_k = U_k B_k
* A^dag U_k = V_k B_k^T + beta_{k+1} v_{k+1} e_k^T
* 2. Compute SVD: B_k = X Sigma Y^T
* 3. Check convergence of the p desired singular values via
* |beta_{k+1} * y_{k,i}| < tol * sigma_i
* where y_{k,i} is the last component of the i-th right singular vector.
* 4. Thick restart: compress the basis to p steps using the p desired
* singular vectors of B_k (columns X_p of X and Y_p of Y):
* B_p^+ = X_p^T B_k Y_p = diag(sigma_1, ..., sigma_p) (diagonal, p x p)
* and update the lattice vectors:
* V_p^+ = V_k Y_p
* U_p^+ = U_k X_p
* The new residual coupling is
* beta_p^+ v_{p+1}^+ = beta_{k+1} * (e_k^T x_p) * v_{k+1}
* where x_p is the p-th selected left singular vector of B_k.
* 5. Go to step 1.
*
* Template parameter
* ------------------
* Field : lattice field type (must support Grid algebra operations)
*
* Usage
* -----
* RestartedLanczosBidiagonalization<Field> irlba(Linop, grid, p, k, tol, maxIter);
* irlba.run(src);
* // Results available via getters.
*/
template <class Field>
class RestartedLanczosBidiagonalization {
public:
LinearOperatorBase<Field> &Linop;
GridBase *Grid;
int Nk; // number of desired singular triplets
int Nm; // Lanczos basis size (Nm > Nk)
RealD Tolerance;
int MaxIter;
bool largest; // if true, target largest singular values; otherwise smallest
// Converged singular triplets (filled after run())
std::vector<RealD> singularValues; // sigma_0 >= sigma_1 >= ...
std::vector<Field> leftVectors; // approximate left singular vectors
std::vector<Field> rightVectors; // approximate right singular vectors
private:
// Working bases (size up to Nm+1)
std::vector<Field> V; // right Lanczos vectors
std::vector<Field> U; // left Lanczos vectors
std::vector<RealD> alpha;
std::vector<RealD> beta;
// After a thick restart, the column at index restart_col of U^dag A V
// has extra non-zero entries (rows 0..restart_col-2) beyond what the
// upper bidiagonal captures. fvec[j] = <U[j] | A V[restart_col]> for
// j = 0..restart_col-1. (fvec[restart_col-1] == beta[restart_col-1].)
// restart_col == -1 means no restart has occurred yet (pure bidiagonal).
std::vector<RealD> fvec;
int restart_col;
public:
/// Store the operator, the grid and the solver parameters.
/// @param _Linop   operator whose singular triplets are sought (held by reference)
/// @param _Grid    grid used to allocate work fields
/// @param _Nk      number of desired singular triplets
/// @param _Nm      Lanczos basis size; must be strictly larger than _Nk
/// @param _tol     relative convergence tolerance
/// @param _maxIt   maximum number of restart cycles
/// @param _largest true: target the largest singular values; false: the smallest
RestartedLanczosBidiagonalization(LinearOperatorBase<Field> &_Linop,
                                  GridBase *_Grid,
                                  int _Nk, int _Nm,
                                  RealD _tol = 1.0e-8,
                                  int _maxIt = 300,
                                  bool _largest = true)
  : Linop(_Linop),
    Grid(_Grid),
    Nk(_Nk),
    Nm(_Nm),
    Tolerance(_tol),
    MaxIter(_maxIt),
    largest(_largest)
{
  // The working basis must leave room for at least one extension step.
  assert(Nk < Nm);
}
/**
* Run IRLBA starting from src.
* On exit, singularValues, leftVectors, rightVectors are filled with
* the Nk converged singular triplets.
*/
void run(const Field &src)
{
assert(norm2(src) > 0.0);
singularValues.clear();
leftVectors.clear();
rightVectors.clear();
// Allocate working bases
V.clear(); U.clear();
alpha.clear(); beta.clear();
fvec.clear(); restart_col = -1;
V.reserve(Nm + 1);
U.reserve(Nm);
// Seed: v_0 = src / ||src||
Field vtmp(Grid);
vtmp = src;
RealD nrm = std::sqrt(norm2(vtmp));
vtmp = (1.0 / nrm) * vtmp;
V.push_back(vtmp);
int pStart = 0; // current basis size at start of extension
RealD betaRestart = 0.0; // coupling from previous restart
for (int iter = 0; iter < MaxIter; ++iter) {
// ----------------------------------------------------------------
// Step 1: extend from pStart steps to Nm steps
// ----------------------------------------------------------------
extendBasis(pStart, Nm, betaRestart);
// verify();
// ----------------------------------------------------------------
// Step 2: SVD of the Nm x Nm B matrix.
// iter=0 (pStart==0): B is exactly bidiagonal — use buildBidiagonal.
// iter>0 (pStart==Nk): after a thick restart, column restart_col of
// U^dag A V has extra off-diagonal entries captured by fvec; use
// buildFullB so the Ritz values and restart vectors are computed from
// the exact projected matrix A V = U B_full.
// ----------------------------------------------------------------
Eigen::MatrixXd B = (pStart == 0) ? buildBidiagonal(Nm) : buildFullB(Nm);
Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::VectorXd sigma = svd.singularValues(); // descending
Eigen::MatrixXd X = svd.matrixU(); // Nm x Nm left SVecs of B
Eigen::MatrixXd Y = svd.matrixV(); // Nm x Nm right SVecs of B
// If targeting smallest, reorder so desired ones come first
Eigen::VectorXi order = sortOrder(sigma);
// ----------------------------------------------------------------
// Step 3: check convergence of the Nk desired singular values
// ----------------------------------------------------------------
RealD betaK = beta.back(); // beta_{k+1}
// In our convention A V = U B (exact), the residual is in the A^dag
// direction: A^dag u_j - sigma_j v_j = betaK * X[Nm-1,j] * V[Nm].
// Convergence criterion: |betaK * X[Nm-1, idx]| < tol * sigma_idx.
int nconv = 0;
for (int i = 0; i < Nk; ++i) {
int idx = order(i);
RealD res = std::abs(betaK * X(Nm - 1, idx));
RealD thr = Tolerance * std::max(sigma(idx), 1.0e-14);
std::cout << GridLogMessage
<< "IRLBA iter " << iter
<< " sigma[" << i << "] = " << sigma(idx)
<< " res = " << res
<< " thr = " << thr << std::endl;
if (res < thr) ++nconv;
else break; // residuals not strictly ordered but break is conservative
}
if (nconv >= Nk) {
std::cout << GridLogMessage
<< "IRLBA converged: " << nconv << " singular values after "
<< iter + 1 << " iterations." << std::endl;
// Collect converged triplets
extractTriplets(Nm, sigma, X, Y, order, Nk);
return;
}
// ----------------------------------------------------------------
// Step 4: implicit restart — compress to Nk steps
// ----------------------------------------------------------------
implicitRestart(Nm, Nk, sigma, X, Y, order, betaK, betaRestart);
// verify();
// Lucky breakdown: exact invariant subspace found; convergence is exact.
// B_p^+ = diag(alpha[0..Nk-1]); extract directly from restart basis.
if (betaRestart < 1.0e-14) {
std::cout << GridLogMessage
<< "IRLBA: lucky breakdown after restart (betaRestart = 0)."
<< " Extracting " << Nk << " exact Ritz triplets." << std::endl;
// Re-run SVD on the p-step diagonal B^+ to get sorted Ritz triplets.
Eigen::MatrixXd Bp = buildBidiagonal(Nk);
Eigen::JacobiSVD<Eigen::MatrixXd> svdp(Bp,
Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::VectorXi ordp = sortOrder(svdp.singularValues());
extractTriplets(Nk, svdp.singularValues(), svdp.matrixU(),
svdp.matrixV(), ordp, Nk);
return;
}
pStart = Nk;
}
std::cout << GridLogMessage
<< "IRLBA: did not converge in " << MaxIter
<< " iterations. Returning best approximations." << std::endl;
// Return best available approximations
Eigen::MatrixXd B = buildFullB((int)alpha.size());
Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::VectorXd sigma = svd.singularValues();
Eigen::MatrixXd X = svd.matrixU();
Eigen::MatrixXd Y = svd.matrixV();
Eigen::VectorXi order = sortOrder(sigma);
int nout = std::min(Nk, (int)alpha.size());
extractTriplets((int)alpha.size(), sigma, X, Y, order, nout);
}
/* Getters */
/// Number of singular triplets currently stored in singularValues.
int getNk() const
{
  return static_cast<int>(singularValues.size());
}
/// Read-only access to the stored singular values.
const std::vector<RealD>& getSingularValues() const
{
  return singularValues;
}
/// Read-only access to the stored left singular vectors.
const std::vector<Field>& getLeftVectors() const
{
  return leftVectors;
}
/// Read-only access to the stored right singular vectors.
const std::vector<Field>& getRightVectors() const
{
  return rightVectors;
}
/**
* Print B_k and U^dag A V to verify the bidiagonalization relation
* A V_m = U_m B_m (exact in our GK convention)
* On the first call (pStart=0), max|B - U^dag A V| should be ~machine epsilon.
* After a restart and extension, the column p of U^dag A V deviates from B
* by O(betaK): this is expected because the thick restart breaks the Krylov
* structure at column p, introducing off-diagonal terms proportional to betaK.
* These terms vanish as betaK -> 0 (convergence), so the algorithm is correct.
*/
void verify()
{
int m = (int)alpha.size();
int nU = (int)U.size();
int nV = (int)V.size();
if (m == 0) { std::cout << GridLogMessage << "IRLBA verify: empty basis" << std::endl; return; }
// Build reference matrix Bref (nU x nV):
// Columns 0..m-1 : buildFullB(m) (bidiagonal + fvec column at restart_col)
// Column m : residual column, two cases:
// (a) restart_col == m (right after implicitRestart, before extendBasis):
// V[m] = sgn*V_old[Nm], so <U[i]|A|V[m]> = fvec[i] for all i
// (b) otherwise (pure GK or after extendBasis):
// only entry (m-1, m) = beta[m-1] (GK recurrence residual)
Eigen::MatrixXd Bref = Eigen::MatrixXd::Zero(nU, nV);
{
Eigen::MatrixXd Bfull = buildFullB(m);
int cols = std::min(m, nV);
Bref.block(0, 0, m, cols) = Bfull.block(0, 0, m, cols);
}
if (nV > m && m > 0) {
if (restart_col == m && (int)fvec.size() == m) {
// Case (a): right after implicitRestart
for (int i = 0; i < m; ++i) Bref(i, m) = fvec[i];
} else if ((int)beta.size() >= m) {
// Case (b): standard GK residual column
Bref(m - 1, m) = beta[m - 1];
}
}
// Compute M[i,j] = <U[i] | A | V[j]>
Eigen::MatrixXd M = Eigen::MatrixXd::Zero(nU, nV);
Field Avj(Grid);
for (int j = 0; j < nV; ++j) {
Linop.Op(V[j], Avj);
for (int i = 0; i < nU; ++i) {
ComplexD ip = innerProduct(U[i], Avj);
M(i, j) = ip.real();
}
}
// Print Bref
std::cout << GridLogMessage
<< "IRLBA verify: Bref (" << nU << "x" << nV << "):" << std::endl;
for (int i = 0; i < nU; ++i) {
std::cout << GridLogMessage << " row " << i << ": ";
for (int j = 0; j < nV; ++j) std::cout << Bref(i,j) << " ";
std::cout << std::endl;
}
// Print U^dag A V
std::cout << GridLogMessage
<< "IRLBA verify: U^dag A V (" << nU << "x" << nV << "):" << std::endl;
for (int i = 0; i < nU; ++i) {
std::cout << GridLogMessage << " row " << i << ": ";
for (int j = 0; j < nV; ++j) std::cout << M(i,j) << " ";
std::cout << std::endl;
}
// Max deviation over the full nU x nV matrix
RealD maxdev = (Bref - M).cwiseAbs().maxCoeff();
std::cout << GridLogMessage
<< "IRLBA verify: max|Bref - U^dag A V| = " << maxdev << std::endl;
// Beta
std::cout << GridLogMessage << "IRLBA verify: beta[0.." << (int)beta.size()-1 << "] = ";
for (auto b : beta) std::cout << b << " ";
std::cout << std::endl;
}
private:
// ------------------------------------------------------------------
// Build the m x m upper-bidiagonal matrix from alpha[0..m-1], beta[0..m-2]
// ------------------------------------------------------------------
/// @param m  dimension of the square matrix to assemble
/// @return   m x m matrix with alpha on the diagonal and the available
///           beta entries on the first superdiagonal; everything else zero.
Eigen::MatrixXd buildBidiagonal(int m) const
{
  Eigen::MatrixXd bidiag = Eigen::MatrixXd::Zero(m, m);

  // Diagonal: alpha[0..m-1]
  for (int d = 0; d < m; ++d) {
    bidiag(d, d) = alpha[d];
  }

  // Superdiagonal: at most m-1 entries, and never more than beta holds.
  const int nsuper = std::min(m - 1, (int)beta.size());
  for (int s = 0; s < nsuper; ++s) {
    bidiag(s, s + 1) = beta[s];
  }

  return bidiag;
}
// ------------------------------------------------------------------
// Build the full m x m B matrix, including the non-bidiagonal column
// at restart_col that arises after a thick restart.
//
// After restart, A V[restart_col] has projections onto all U[0..restart_col-1]
// (not just U[restart_col-1]). These are stored in fvec[0..restart_col-1]
// and make column restart_col of U^dag A V non-bidiagonal.
// ------------------------------------------------------------------
/// @param m  dimension of the square matrix to assemble
/// @return   buildBidiagonal(m), with rows 0..restart_col-1 of column
///           restart_col overwritten by fvec when a restart column exists
///           inside the m x m window.
Eigen::MatrixXd buildFullB(int m) const
{
  Eigen::MatrixXd B = buildBidiagonal(m);

  const bool haveRestartColumn =
      (restart_col >= 0) && (restart_col < m) && !fvec.empty();
  if (!haveRestartColumn) return B;

  // Never read past the end of fvec, nor at or below the diagonal.
  const int nrows = std::min(restart_col, (int)fvec.size());
  for (int j = 0; j < nrows; ++j) {
    B(j, restart_col) = fvec[j];
    // Separators added: the column index used to run straight into the value.
    std::cout << GridLogDebug << "buildFullB: B(" << j << "," << restart_col
              << ") = " << B(j, restart_col) << std::endl;
  }
  return B;
}
// ------------------------------------------------------------------
// Return a permutation vector that puts the desired Nk singular values
// first (largest first if largest==true, smallest first otherwise).
// Eigen's JacobiSVD already returns sigma in descending order, so for
// largest we just return 0,1,...,m-1; for smallest we reverse.
// ------------------------------------------------------------------
/// @param sigma  singular values; only its length is used here
/// @return       index vector of the same length: identity when `largest`
///               is set, reversed (m-1, ..., 0) otherwise.
Eigen::VectorXi sortOrder(const Eigen::VectorXd &sigma) const
{
  const int count = (int)sigma.size();
  Eigen::VectorXi perm(count);
  for (int pos = 0; pos < count; ++pos) {
    perm(pos) = largest ? pos : (count - 1 - pos);
  }
  return perm;
}
// ------------------------------------------------------------------
// Extend the Lanczos bidiagonalization from pStart to kEnd steps.
// On first call pStart==0 (V[0] already set).
// On restart calls V[0..pStart], U[0..pStart-1], alpha[0..pStart-1],
// beta[0..pStart-1] are already set; betaRestart is the coupling
// beta_{pStart} that drives the first new U step.
// ------------------------------------------------------------------
void extendBasis(int pStart, int kEnd, RealD betaRestart)
{
// Truncate containers to pStart (Lattice has no default constructor)
if ((int)V.size() > pStart + 1) V.erase(V.begin() + pStart + 1, V.end());
if ((int)U.size() > pStart) U.erase(U.begin() + pStart, U.end());
alpha.resize(pStart);
beta.resize(pStart);
Field p(Grid), r(Grid);
for (int k = pStart; k < kEnd; ++k) {
// p = A v_k
Linop.Op(V[k], p);
// Remove previous left vector coupling
if (k > 0) {
p = p - beta[k - 1] * U[k - 1];
}
// On the first step after a restart, beta[pStart-1] was already set;
// but V[pStart] was already constructed including the beta correction,
// so no extra subtraction needed here beyond the standard recurrence.
// Reorthogonalize p against U, then alpha_k = ||p||, u_k = p/alpha_k
reorthogonalize(p, U);
RealD ak = std::sqrt(norm2(p));
if (ak < 1.0e-14) {
std::cout << GridLogMessage
<< "IRLBA extendBasis: lucky breakdown at step " << k
<< " (alpha = " << ak << ")" << std::endl;
alpha.push_back(ak);
Field zero(Grid); zero = Zero();
U.push_back(zero);
beta.push_back(0.0);
V.push_back(zero);
break;
}
alpha.push_back(ak);
Field u(Grid);
u = (1.0 / ak) * p;
U.push_back(u);
// r = A^dag u_k - alpha_k v_k, reorthogonalize, then beta_{k+1} = ||r||
Linop.AdjOp(U[k], r);
r = r - ak * V[k];
reorthogonalize(r, V);
RealD bk = std::sqrt(norm2(r));
beta.push_back(bk);
std::cout << GridLogMessage
<< "IRLBA extend step " << k
<< " alpha = " << ak
<< " beta = " << bk << std::endl;
// Always push v_{k+1} (needed as residual direction for restart)
if (bk < 1.0e-14) {
std::cout << GridLogMessage
<< "IRLBA extendBasis: lucky breakdown (beta = 0) at step "
<< k << std::endl;
Field zero(Grid); zero = Zero();
V.push_back(zero);
break;
}
Field vnext(Grid);
vnext = (1.0 / bk) * r;
V.push_back(vnext);
if (k == kEnd - 1) break; // v_{k+1} pushed above; stop here
}
}
public:
// ------------------------------------------------------------------
// Block reorthogonalization helpers.
// Declared public because CUDA extended lambdas cannot live inside
// private/protected member functions.
//
// batchInnerProducts: computes c[j] = <basis[j], vec> for all j
// in a single GPU pass (one accelerator_barrier instead of n).
// Queues n pairs of (per-site kernel, reduceKernel) to computeStream
// without intermediate CPU syncs, then syncs once at the end.
//
// batchUpdate: computes vec -= sum_j c[j]*basis[j] in one GPU kernel.
//
// reorthogonalize: two-pass Classical Gram-Schmidt (CGS2) using the
// two helpers above. Each pass costs 2 GPU syncs (1 IP + 1 update)
// instead of 2n syncs per pass in the old sequential MGS.
// ------------------------------------------------------------------
// Compute c[j] = <basis[j], vec> for every j in one batched pass.
//   vec   : field to project (read only)
//   basis : fields to project onto (read only)
//   c     : output, resized to basis.size(); left empty if basis is empty
// Per-site inner products are written to a device buffer, copied to the host,
// summed over sites and SIMD lanes there, and finally summed across ranks
// with GlobalSumVector.
void batchInnerProducts(const Field &vec,
const std::vector<Field> &basis,
std::vector<ComplexD> &c)
{
int n = (int)basis.size();
c.resize(n);
if (n == 0) return;
typedef typename Field::vector_object vobj;
typedef decltype(innerProduct(vobj(), vobj())) inner_t;
typedef decltype(basis[0].View(AcceleratorRead)) View;
GridBase *grid = vec.Grid();
uint64_t oSites = grid->oSites();
uint64_t nsimd = grid->Nsimd();
// all_ip[j * oSites + ss] = per-site inner product of basis[j] and vec at site ss.
// Layout: n contiguous blocks of oSites each.
deviceVector<inner_t> all_ip((uint64_t)n * oSites);
inner_t *all_ip_p = &all_ip[0];
// Open one read view per basis field on the host, then copy the view objects
// into a device-side array so the kernel can index them by j.
hostVector<View> h_basis_v(n);
deviceVector<View> d_basis_v(n);
for (int j = 0; j < n; ++j) {
h_basis_v[j] = basis[j].View(AcceleratorRead);
acceleratorPut(d_basis_v[j], h_basis_v[j]);
}
View *basis_vp = &d_basis_v[0];
// Queue n per-site kernels to the accelerator stream — no intermediate barriers.
// NOTE(review): this relies on accelerator_for not synchronising by itself;
// if the macro already ends with a barrier, each iteration below syncs and the
// explicit accelerator_barrier() afterwards is redundant — confirm against the
// macro definition.
autoView(vec_v, vec, AcceleratorRead);
for (int j = 0; j < n; ++j) {
// Local copies so the kernel body captures plain values.
int jj = j;
uint64_t oSites_ = oSites;
accelerator_for(ss, oSites, nsimd, {
auto x = coalescedRead(basis_vp[jj][ss]);
auto y = coalescedRead(vec_v[ss]);
coalescedWrite(all_ip_p[jj * oSites_ + ss], innerProduct(x, y));
});
}
// ONE sync after all n kernels
accelerator_barrier();
// Copy all per-site results to host
hostVector<inner_t> all_ip_h((uint64_t)n * oSites);
acceleratorCopyFromDevice(all_ip_p, &all_ip_h[0], (uint64_t)n * oSites * sizeof(inner_t));
// Reduce on host: sum over oSites, then collapse SIMD lanes via Reduce(TensorRemove(...))
// TensorRemove strips the iSinglet tensor wrapper to expose the SIMD scalar type.
// Reduce sums all nsimd lanes and returns a plain scalar (RealD or ComplexD).
std::vector<ComplexD> raw(n);
for (int j = 0; j < n; ++j) {
inner_t sum = Zero();
for (uint64_t ss = 0; ss < oSites; ++ss)
sum += all_ip_h[(uint64_t)j * oSites + ss];
raw[j] = ComplexD(Reduce(TensorRemove(sum)));
}
// Combine the node-local partial sums across ranks.
grid->GlobalSumVector(&raw[0], n);
for (int j = 0; j < n; ++j) c[j] = raw[j];
// Views opened with View() above are not scoped, so close them explicitly.
for (int j = 0; j < n; ++j) h_basis_v[j].ViewClose();
}
// Subtract a linear combination of basis fields from vec in one kernel:
//   vec <- vec - sum_k c[k] * basis[k]
//   vec   : field updated in place
//   basis : fields to subtract (read only)
//   c     : one complex coefficient per basis field; entries 0..basis.size()-1 are read
// Does nothing when basis is empty.
void batchUpdate(Field &vec,
const std::vector<Field> &basis,
const std::vector<ComplexD> &c)
{
int n = (int)basis.size();
if (n == 0) return;
typedef typename Field::vector_object vobj;
typedef decltype(basis[0].View(AcceleratorRead)) View;
GridBase *grid = vec.Grid();
uint64_t oSites = grid->oSites();
uint64_t nsimd = grid->Nsimd();
// Split complex coefficients into real/imag double arrays on device.
// Using doubles avoids potential ComplexD-device-code compatibility issues.
hostVector<double> h_re(n), h_im(n);
deviceVector<double> d_re(n), d_im(n);
for (int k = 0; k < n; ++k) {
h_re[k] = c[k].real();
h_im[k] = c[k].imag();
}
acceleratorCopyToDevice(&h_re[0], &d_re[0], n * sizeof(double));
acceleratorCopyToDevice(&h_im[0], &d_im[0], n * sizeof(double));
double *re_p = &d_re[0];
double *im_p = &d_im[0];
// Basis views: opened on the host, then the view objects are copied into a
// device-side array so the kernel can index them by k.
hostVector<View> h_basis_v(n);
deviceVector<View> d_basis_v(n);
for (int k = 0; k < n; ++k) {
h_basis_v[k] = basis[k].View(AcceleratorRead);
acceleratorPut(d_basis_v[k], h_basis_v[k]);
}
View *basis_vp = &d_basis_v[0];
// Single kernel: vec[ss] -= sum_k (re[k] + i*im[k]) * basis[k][ss]
// vec is both read and written per site, so it is opened with AcceleratorWrite.
autoView(vec_v, vec, AcceleratorWrite);
accelerator_for(ss, oSites, nsimd, {
auto v = coalescedRead(vec_v[ss]);
for (int k = 0; k < n; ++k) {
auto b = coalescedRead(basis_vp[k][ss]);
v = v - re_p[k] * b - timesI(im_p[k] * b);
}
coalescedWrite(vec_v[ss], v);
});
// Views opened with View() above are not scoped, so close them explicitly.
for (int k = 0; k < n; ++k) h_basis_v[k].ViewClose();
}
// ------------------------------------------------------------------
// Full reorthogonalization using two-pass Classical Gram-Schmidt (CGS2).
// Each pass calls batchInnerProducts (1 GPU sync) + batchUpdate (1 sync),
// replacing the old 2n GPU syncs per pass from sequential MGS.
// ------------------------------------------------------------------
/// Remove from vec its components along every field in basis.
/// Two identical projection passes are applied; an empty basis is a no-op.
void reorthogonalize(Field &vec, const std::vector<Field> &basis)
{
  if (!basis.empty()) {
    std::vector<ComplexD> overlaps;
    int passesLeft = 2;
    while (passesLeft-- > 0) {
      batchInnerProducts(vec, basis, overlaps); // overlaps[j] = <basis[j], vec>
      batchUpdate(vec, basis, overlaps);        // vec -= sum_j overlaps[j] basis[j]
    }
  }
}
// ------------------------------------------------------------------
// Implicit restart: given the Nm-step bidiagonalization and its SVD,
// compress to Nk steps via implicit QR shifts applied to B_k.
//
// The "shifts" are the Nm - Nk singular values we want to deflate
// (those NOT in the desired set). We apply them as implicit QR steps
// to the bidiagonal matrix, then update the lattice bases accordingly.
//
// After this call:
// V[0..Nk], U[0..Nk-1], alpha[0..Nk-1], beta[0..Nk-1] are updated.
// betaRestart ← new beta_Nk coupling for the next extension.
// ------------------------------------------------------------------
void implicitRestart(int k, int p,
                     const Eigen::VectorXd &sigma,
                     const Eigen::MatrixXd &X,
                     const Eigen::MatrixXd &Y,
                     const Eigen::VectorXi &order,
                     RealD betaK,
                     RealD &betaRestart)
{
  // Parameters
  //   k            number of Lanczos steps currently held (V[0..k], U[0..k-1])
  //   p            number of Ritz triplets to keep
  //   sigma, X, Y  SVD of B_k (B_k = X Sigma Y^T)
  //   order        order(i) is the column of X / Y kept as new vector i
  //   betaK        coupling to V[k] left by the last extension
  //   betaRestart  (out) coupling between the compressed basis and V[p]
  //
  // Preconditions (not checked here): k >= 1, p >= 1, V holds at least
  // k + 1 fields and U at least k fields, and order has at least p entries.
  //
  // Thick restart (Baglama & Reichel, Sec. 2.2):
  //
  // Given B_k = X Sigma Y^T, define the new p-step basis by:
  //   V^+_i = V_k * y_{order(i)}   (right sing. vec. of B_k)
  //   U^+_i = U_k * x_{order(i)}   (left sing. vec. of B_k)
  //
  // Then A V^+_i = A V_k y_{order(i)} = U_k B_k y_{order(i)}
  //              = sigma_{order(i)} U_k x_{order(i)} = sigma_{order(i)} U^+_i
  //
  // So B_p^+ = diag(sigma_{order(0)}, ..., sigma_{order(p-1)}) is DIAGONAL,
  // all internal betas are zero.
  //
  // The residual coupling comes from A^dag U_k = V_k B_k^T + betaK V[k] e_{k-1}^T:
  //   A^dag U^+_{p-1} - sigma_{order(p-1)} V^+_{p-1}
  //     = V_k (B_k^T x_{order(p-1)} - sigma_{order(p-1)} y_{order(p-1)})
  //       + betaK * X(k-1, order(p-1)) * V[k]
  //     = betaK * X(k-1, order(p-1)) * V[k]   (since B_k^T x_j = sigma_j y_j)
  //
  // Therefore: betaRestart = |betaK * X(k-1, order(p-1))|
  //            V[p]        = sign(X(k-1, order(p-1))) * V[k]

  // ---- Build new lattice vectors ----
  // Each field is constructed directly inside the destination vector and
  // accumulated in place. The reserve() calls guarantee that no reallocation
  // happens, so the references taken from back() stay valid.
  std::vector<Field> Vnew, Unew;
  Vnew.reserve(p + 1);
  Unew.reserve(p);
  for (int i = 0; i < p; ++i) {
    const int idx = order(i);
    Vnew.emplace_back(Grid);
    Field &vi = Vnew.back();
    vi = Zero();
    for (int j = 0; j < k; ++j)
      vi = vi + Y(j, idx) * V[j];
  }
  for (int i = 0; i < p; ++i) {
    const int idx = order(i);
    Unew.emplace_back(Grid);
    Field &ui = Unew.back();
    ui = Zero();
    for (int j = 0; j < k; ++j)
      ui = ui + X(j, idx) * U[j];
  }

  // New v_{p} (0-indexed: V[p]) = sign * V[k]
  // From A^dag U_k = V_k B_k^T + betaK V[k] e_{k-1}^T:
  //   A^dag U^+_j - sigma_j V^+_j = betaK * X(k-1, order(j)) * V[k]
  // The last Ritz pair (j=p-1) defines betaRestart and the sign of V[p].
  // All p couplings (j=0..p-1) are stored in fvec so that buildFullB can
  // reconstruct the exact column p of U^dag A V after the next extension.
  const RealD coeff = betaK * X(k - 1, order(p - 1));
  betaRestart = std::abs(coeff);
  const RealD sgn = (coeff >= 0.0) ? 1.0 : -1.0;
  fvec.resize(p);
  for (int j = 0; j < p; ++j)
    fvec[j] = betaK * X(k - 1, order(j)) * sgn;
  // fvec[p-1] == |coeff| by construction.
  // NOTE(review): when the branch below zeroes betaRestart, fvec keeps its
  // (tiny) entries, so fvec[p-1] and betaRestart no longer agree exactly.
  restart_col = p;

  // V[k] still refers to the old basis here; V is only replaced at commit.
  Vnew.emplace_back(Grid);
  Field &vp = Vnew.back();   // becomes V[p]
  if (betaRestart > 1.0e-14) {
    vp = sgn * V[k];
  } else {
    // Coupling is numerically zero: record an exact zero and a null vector.
    betaRestart = 0.0;
    vp = Zero();
  }

  // ---- New alpha, beta ----
  // B_p^+ is diagonal: alpha^+_i = sigma_{order(i)}, all internal beta = 0
  std::vector<RealD> alpha_new(p), beta_new(p);
  for (int i = 0; i < p; ++i) alpha_new[i] = sigma(order(i));
  for (int i = 0; i < p - 1; ++i) beta_new[i] = 0.0;
  beta_new[p - 1] = betaRestart;

  // ---- Commit new state ----
  // Swap the lattice bases instead of copy-assigning them: the old fields
  // end up in Vnew / Unew and are released when this function returns.
  V.swap(Vnew);
  U.swap(Unew);
  alpha = alpha_new;
  beta = beta_new;

  std::cout << GridLogMessage
            << "IRLBA restart: compressed to " << p << " steps,"
            << " new beta_p = " << betaRestart << std::endl;
}
// ------------------------------------------------------------------
// Extract the desired singular triplets into the public output vectors.
// ------------------------------------------------------------------
void extractTriplets(int m,
                     const Eigen::VectorXd &sigma,
                     const Eigen::MatrixXd &X,
                     const Eigen::MatrixXd &Y,
                     const Eigen::VectorXi &order,
                     int nout)
{
  // Parameters
  //   m            number of basis vectors combined per output vector
  //   sigma, X, Y  singular values / left / right singular vectors of the
  //                projected matrix
  //   order        order(i) is the column of X / Y used for output triplet i
  //   nout         number of triplets written to singularValues,
  //                leftVectors and rightVectors
  singularValues.resize(nout);
  leftVectors.clear();  leftVectors.reserve(nout);
  rightVectors.clear(); rightVectors.reserve(nout);

  // The sums never run past the vectors actually stored in U and V.
  const int storedU = static_cast<int>(U.size());
  const int storedV = static_cast<int>(V.size());
  const int nU = (m < storedU) ? m : storedU;
  const int nV = (m < storedV) ? m : storedV;

  for (int i = 0; i < nout; ++i) {
    const int idx = order(i);
    singularValues[i] = sigma(idx);

    // Left singular vector of A: svec_L = U_m * x_i.
    // Built directly inside leftVectors so that no temporary lattice field
    // has to be copied; reserve() above keeps the reference valid.
    leftVectors.emplace_back(Grid);
    Field &svL = leftVectors.back();
    svL = Zero();
    for (int j = 0; j < nU; ++j)
      svL = svL + X(j, idx) * U[j];

    // Right singular vector of A: svec_R = V_m * y_i
    rightVectors.emplace_back(Grid);
    Field &svR = rightVectors.back();
    svR = Zero();
    for (int j = 0; j < nV; ++j)
      svR = svR + Y(j, idx) * V[j];
  }
}
};
NAMESPACE_END(Grid);
#endif
+54 -54
View File
@@ -327,9 +327,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e) // src_o = (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm. _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
} }
@@ -347,17 +347,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even); src_e = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix); SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@@ -396,13 +396,13 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -416,17 +416,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even); src_e_i = src_e-tmp; GRID_ASSERT( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(src_e_i,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@@ -461,9 +461,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even ); _Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd ); _Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd ); src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -478,18 +478,18 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even ); _Matrix.Meooe(sol_o, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even ); src_e_i = src_e - tmp; GRID_ASSERT( src_e_i.Checkerboard() == Even );
_Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even ); _Matrix.MooeeInv(src_e_i, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even ); setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd ); setCheckerboard(sol, sol_o); GRID_ASSERT( sol_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{ {
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix); NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd); this->_HermitianRBSolver(_OpEO, src_o, sol_o); GRID_ASSERT(sol_o.Checkerboard() == Odd);
} }
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
@@ -539,13 +539,13 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e) // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp; Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd); _Matrix.MooeeInv(Mtmp,tmp); GRID_ASSERT( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -560,12 +560,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even); tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd );
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -612,12 +612,12 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -638,12 +638,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o_i,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even); tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() ==Odd );
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -684,9 +684,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even ); _Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd ); _Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd ); src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -707,12 +707,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even ); _Matrix.Meooe(sol_o_i, tmp); GRID_ASSERT( tmp.Checkerboard() == Even );
tmp = src_e - tmp; assert( src_e.Checkerboard() == Even ); tmp = src_e - tmp; GRID_ASSERT( src_e.Checkerboard() == Even );
_Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even ); _Matrix.MooeeInv(tmp, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even ); setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd ); setCheckerboard(sol, sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() == Odd );
}; };
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
+931
View File
@@ -0,0 +1,931 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LANC_H
#define GRID_LANC_H
#include <string.h> //memset
#ifdef USE_LAPACK
#ifdef USE_MKL
#include<mkl_lapack.h>
#else
void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
//#include <lapacke/lapacke.h>
#endif
#endif
//#include <Grid/algorithms/densematrix/DenseMatrix.h>
// eliminate temporary vector in calc()
#define MEM_SAVE
namespace Grid
{
struct Bisection
{
#if 0
static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
std::vector < RealD > &BETA,
std::vector < RealD > &eig)
{
int i, j;
std::vector < RealD > evec1 (row_num + 3);
std::vector < RealD > evec2 (row_num + 3);
RealD eps2;
ALPHA[1] = 0.;
BETHA[1] = 0.;
for (i = 0; i < row_num - 1; i++)
{
ALPHA[i + 1] = A[i * (row_num + 1)].real ();
BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
}
ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
// Do we really need to sort here?
int begin = 1;
int end = row_num;
int swapped = 1;
while (swapped)
{
swapped = 0;
for (i = begin; i < end; i++)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
end--;
for (i = end - 1; i >= begin; i--)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
begin++;
}
for (i = 0; i < row_num; i++)
{
for (j = 0; j < row_num; j++)
{
if (i == j)
H[i * row_num + j] = evec2[i + 1];
else
H[i * row_num + j] = 0.;
}
}
}
#endif
// Eigenvalues of a symmetric tridiagonal matrix by bisection.
//
// Indexing is 1-based throughout:
//   c[1..n]  is read as the diagonal,
//   b[2..n]  is read as the off-diagonal; b[1] is overwritten with 0.
//   n        order of the matrix.
//   m1, m2   inclusive range of eigenvalue indices to compute; results are
//            written to x[m1..m2].
//   eps1     absolute tolerance of the bisection; if <= 0 it is replaced by
//            a default derived from relfeh and the spectrum bounds.
//   relfeh   relative tolerance, also used to regularise a zero pivot.
//   x        output; must be indexable up to m2.
//   eps2     output; set to 0.5*eps1 + 7*(relfeh * spectrum bound).
//
// The layout follows the classic "bisect" procedure of Barth, Martin and
// Wilkinson. With the counting rule used below, x[k] is presumably the k-th
// eigenvalue counted from the smallest -- confirm against callers.
// Side effects: modifies b[1] and prints one line per computed eigenvalue.
static void bisec (std::vector < RealD > &c,
std::vector < RealD > &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh, std::vector < RealD > &x, RealD & eps2)
{
// wu[i] holds the best known lower bound for eigenvalue i.
std::vector < RealD > wu (n + 2);
RealD h, q, x1, xu, x0, xmin, xmax;
int i, a, k;
b[1] = 0.0;
// Enclose the spectrum: each row contributes c[i] +/- (|b[i]| + |b[i+1]|).
xmin = c[n] - fabs (b[n]);
xmax = c[n] + fabs (b[n]);
for (i = 1; i < n; i++)
{
h = fabs (b[i]) + fabs (b[i + 1]);
if (c[i] + h > xmax)
xmax = c[i] + h;
if (c[i] - h < xmin)
xmin = c[i] - h;
}
// The upper bound is doubled before use (extra safety margin).
xmax *= 2.;
eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
if (eps1 <= 0.0)
eps1 = eps2;
eps2 = 0.5 * eps1 + 7.0 * (eps2);
// Start every requested eigenvalue with the full enclosing interval.
x0 = xmax;
for (i = m1; i <= m2; i++)
{
x[i] = xmax;
wu[i] = xmin;
}
// Work from the highest requested index down to the lowest.
for (k = m2; k >= m1; k--)
{
// Lower bound for eigenvalue k: scan wu[k], wu[k-1], ..., wu[m1] and stop
// at the first entry above xmin (setting i = m1 - 1 ends the loop).
xu = xmin;
i = k;
do
{
if (xu < wu[i])
{
xu = wu[i];
i = m1 - 1;
}
i--;
}
while (i >= m1);
// Upper bound: never larger than what earlier passes found for x[k].
if (x0 > x[k])
x0 = x[k];
// Bisect [xu, x0] until it is narrower than the combined tolerance.
while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
{
x1 = (xu + x0) / 2;
// a counts the negative terms of the pivot recurrence evaluated at x1;
// by the Sturm sequence property this is the number of eigenvalues below x1.
a = 0;
q = 1.0;
for (i = 1; i <= n; i++)
{
// A zero pivot is replaced by |b[i]| / relfeh instead of dividing by zero.
q =
c[i] - x1 -
((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
if (q < 0)
a++;
}
// printf("x1=%0.14e a=%d\n",x1,a);
if (a < k)
{
// Fewer than k pivots are negative: x1 is a lower bound for eigenvalue k.
// The same evaluation also tightens the stored bounds of other indices.
if (a < m1)
{
xu = x1;
wu[m1] = x1;
}
else
{
xu = x1;
wu[a + 1] = x1;
if (x[a] > x1)
x[a] = x1;
}
}
else
x0 = x1;
}
// NOTE(review): unconditional printf for every eigenvalue (and on every rank
// that calls this) -- looks like leftover debug output.
printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
x[k] = (x0 + xu) / 2;
}
}
};
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template < class Field > class SimpleLanczos
{
const RealD small = 1.0e-16;
public:
int lock;
int get;
int Niter;
int converged;
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
int Np; // Np -- Number of spare vecs in kryloc space
int Nm; // Nm -- total number of vectors
RealD OrthoTime;
RealD eresid;
// SortEigen < Field > _sort;
LinearFunction < Field > &_Linop;
// OperatorFunction < Field > &_poly;
/////////////////////////
// Constructor
/////////////////////////
// Initialisation hook; currently a no-op.
void init ()
{
}
// void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector < RealD > >&evecs);
// Construct a Lanczos driver for the operator Linop.
//
//   Linop    operator applied in every Lanczos step (held by reference,
//            so it must outlive this object)
//   _Nstop   number of eigenvalues checked for convergence
//   _Nk      number of converged eigenvalues sought
//   _Nm      total number of vectors; must exceed _Nk
//   _eresid  relative eigenvalue change below which a value counts as converged
//   _Niter   maximum number of iterations (stored in Niter)
//
// The polynomial-filter argument of the original interface is disabled.
// All data members are initialised, in declaration order; the bookkeeping
// members not covered by an argument (lock, get, converged, OrthoTime)
// start at zero. Np is derived as _Nm - _Nk and asserted positive.
SimpleLanczos (LinearFunction < Field > &Linop,
               int _Nstop,
               int _Nk,
               int _Nm,
               RealD _eresid,
               int _Niter):
  lock (0),
  get (0),
  Niter (_Niter),
  converged (0),
  Nstop (_Nstop),
  Nk (_Nk),
  Np (_Nm - _Nk),
  Nm (_Nm),
  OrthoTime (0.),
  eresid (_eresid),
  _Linop (Linop)
{
  assert (Np > 0);
}
/////////////////////////
// Sanity checked this routine (step) against Saad.
/////////////////////////
void RitzMatrix (std::vector < Field > &evec, int k)
{
if (1)
return;
GridBase *grid = evec[0].Grid();
Field w (grid);
std::cout << GridLogMessage << "RitzMatrix " << std::endl;
for (int i = 0; i < k; i++)
{
_Linop(evec[i], w);
// _poly(_Linop,evec[i],w);
std::cout << GridLogMessage << "[" << i << "] ";
for (int j = 0; j < k; j++)
{
ComplexD in = innerProduct (evec[j], w);
if (fabs ((double) i - j) > 1)
{
if (abs (in) > 1.0e-9)
{
std::cout << GridLogMessage << "oops" << std::endl;
abort ();
}
else
std::cout << GridLogMessage << " 0 ";
}
else
{
std::cout << GridLogMessage << " " << in << " ";
}
}
std::cout << GridLogMessage << std::endl;
}
}
void step (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
Field & last, Field & current, Field & next, uint64_t k)
{
if (lmd.size () <= k)
lmd.resize (k + Nm);
if (lme.size () <= k)
lme.resize (k + Nm);
// _poly(_Linop,current,next ); // 3. wk:=Avk−βkv_{k1}
_Linop(current, next); // 3. wk:=Avk−βkv_{k1}
if (k > 0)
{
next -= lme[k - 1] * last;
}
// std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
ComplexD zalph = innerProduct (current, next); // 4. αk:=(wk,vk)
RealD alph = real (zalph);
next = next - alph * current; // 5. wk:=wk−αkvk
// std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
RealD beta = normalise (next); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
// norm=beta;
int interval = Nm / 100 + 1;
if ((k % interval) == 0)
std::
cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
beta << std::endl;
const RealD tiny = 1.0e-20;
if (beta < tiny)
{
std::cout << GridLogMessage << " beta is tiny " << beta << std::
endl;
}
lmd[k] = alph;
lme[k] = beta;
}
// One shifted QR sweep on a symmetric tridiagonal matrix, done with plane
// rotations, with the rotations accumulated into Qt.
//
//   lmd, lme   diagonal and off-diagonal entries, updated in place
//   Nk         number of entries per Qt row that are rotated
//   Nm         stride of Qt: element (i, row) lives at Qt[i + Nm * row]
//   Qt         accumulated rotations, updated in place
//   Dsh        shift used for the first rotation
//   kmin,kmax  the sweep touches rows kmin-1 .. kmax-1 (0-based)
//
// Note that the parameters Nk and Nm hide the class members of the same name.
// NOTE(review): hypot() returning 0 (both arguments zero) would make Fden
// infinite -- not guarded here.
void qr_decomp (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int Nk,
int Nm,
std::vector < RealD > &Qt, RealD Dsh, int kmin, int kmax)
{
// First rotation: built from the shifted diagonal entry and its neighbour.
int k = kmin - 1;
RealD x;
RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
RealD c = (lmd[k] - Dsh) * Fden;
RealD s = -lme[k] * Fden;
// Apply the rotation to the 2x2 block (k, k+1) from both sides.
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
// x is the entry the rotation creates outside the tridiagonal band; the
// loop below removes it one row at a time.
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
// Accumulate the same rotation into rows k and k+1 of Qt.
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
// Givens transformations
for (int k = kmin; k < kmax - 1; ++k)
{
// Rotation chosen from (lme[k-1], x) so that the out-of-band entry x vanishes.
RealD Fden = 1.0 / hypot (x, lme[k - 1]);
RealD c = lme[k - 1] * Fden;
RealD s = -x * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
lme[k - 1] = c * lme[k - 1] - s * x;
// The last rotation of the sweep creates no new out-of-band entry.
if (k != kmax - 2)
{
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
}
// Accumulate the rotation into rows k and k+1 of Qt.
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
}
}
#if 0
#ifdef USE_LAPACK
#ifdef USE_MKL
#define LAPACK_INT MKL_INT
#else
#define LAPACK_INT long long
#endif
void diagonalize_lapack (std::vector < RealD > &lmd, std::vector < RealD > &lme, int N1, // all
int N2, // get
GridBase * grid)
{
const int size = Nm;
LAPACK_INT NN = N1;
double evals_tmp[NN];
double DD[NN];
double EE[NN];
for (int i = 0; i < NN; i++)
for (int j = i - 1; j <= i + 1; j++)
if (j < NN && j >= 0)
{
if (i == j)
DD[i] = lmd[i];
if (i == j)
evals_tmp[i] = lmd[i];
if (j == (i - 1))
EE[j] = lme[j];
}
LAPACK_INT evals_found;
LAPACK_INT lwork =
((18 * NN) >
(1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
LAPACK_INT liwork = 3 + NN * 10;
LAPACK_INT iwork[liwork];
double work[lwork];
LAPACK_INT isuppz[2 * NN];
char jobz = 'N'; // calculate evals only
char range = 'I'; // calculate il-th to iu-th evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
LAPACK_INT info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN / total) + 1;
double vl = 0.0, vu = 0.0;
LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
if (iu > NN)
iu = NN;
double tol = 0.0;
if (1)
{
memset (evals_tmp, 0, sizeof (double) * NN);
if (il <= NN)
{
printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
#ifdef USE_MKL
dstegr (&jobz, &range, &NN,
#else
LAPACK_dstegr (&jobz, &range, &NN,
#endif
(double *) DD, (double *) EE, &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double *) NULL, &NN,
isuppz, work, &lwork, iwork, &liwork, &info);
for (int i = iu - 1; i >= il - 1; i--)
{
printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
evals_tmp[i] = evals_tmp[i - (il - 1)];
if (il > 1)
evals_tmp[i - (il - 1)] = 0.;
}
}
{
grid->GlobalSumVector (evals_tmp, NN);
}
}
// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
}
#undef LAPACK_INT
#endif
void diagonalize (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int N2, int N1, GridBase * grid)
{
#ifdef USE_LAPACK
const int check_lapack = 0; // just use lapack if 0, check against lapack if 1
if (!check_lapack)
return diagonalize_lapack (lmd, lme, N2, N1, grid);
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
}
#endif
// Scale v to unit norm in place and return the norm it had before scaling.
// A zero vector is not special-cased: the scale factor is then 1/0.
static RealD normalise (Field & v)
{
  const RealD length = sqrt (norm2 (v));
  v = v * (1.0 / length);
  return length;
}
// Project the first k vectors of evec out of w one after another, then
// normalise w. The time spent is added to OrthoTime (in seconds).
// The vectors in evec are taken to be normalised already.
void orthogonalize (Field & w, std::vector < Field > &evec, int k)
{
  typedef typename Field::scalar_type MyComplex;

  // Start the clock with a negative stamp; adding the end stamp below
  // leaves the elapsed time.
  double elapsed = -usecond () / 1e6;
  MyComplex overlap;

  // Optional re-orthonormalisation of the basis itself; switched off.
  const bool cleanBasisFirst = false;
  if (cleanBasisFirst)
    {
      for (int col = 0; col < k; ++col)
        {
          normalise (evec[col]);
          for (int prev = 0; prev < col; prev++)
            {
              overlap = innerProduct (evec[prev], evec[col]);
              evec[col] = evec[col] - overlap * evec[prev];
            }
        }
    }

  // Sequential projection of w against each basis vector.
  for (int col = 0; col < k; ++col)
    {
      overlap = innerProduct (evec[col], w);
      w = w - overlap * evec[col];
    }
  normalise (w);

  elapsed += usecond () / 1e6;
  OrthoTime += elapsed;
}
void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
{
for (int i = 0; i < Qt.size (); ++i)
Qt[i] = 0.0;
for (int k = 0; k < Nm; ++k)
Qt[k + k * Nm] = 1.0;
}
// Run the Lanczos recurrence from src in batches of Nm steps. After each
// batch, eigenvalues at both ends of the accumulated tridiagonal matrix are
// computed with Bisection::bisec and compared with the previous batch; the
// routine returns once no compared value changed by more than eresid
// (relative) on any rank.
//
//   eval   only its size is printed; it is never written here
//   src    start vector (normalised into a local copy)
//   Nconv  set to 0 and not updated afterwards
//
// NOTE(review): the only exit is the convergence test -- Niter is not
// consulted, so the loop is unbounded if the values never settle.
// NOTE(review): the eigenvalues found are only printed and kept in the
// local arrays low / high; nothing is returned to the caller.
void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
{
GridBase *grid = src.Grid();
// assert(grid == src._grid);
std::
cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
endl;
std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
std::cout << GridLogMessage << " -- size of eval = " << eval.
size () << std::endl;
// assert(c.size() && Nm == eval.size());
// Tridiagonal coefficients; step() grows them as the iteration proceeds.
std::vector < RealD > lme (Nm);
std::vector < RealD > lmd (Nm);
// Only three lattice vectors are kept: no basis is stored.
Field current (grid);
Field last (grid);
Field next (grid);
Nconv = 0;
RealD beta_k; // unused
// Set initial vector: the normalised source.
current = src;
std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
endl;
normalise (current);
std::
cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
std::endl;
// Timers
OrthoTime = 0.;
double t0 = usecond () / 1e6;
RealD norm; // unused
// iter counts Lanczos steps over all batches.
uint64_t iter = 0;
// initted is false during the first batch, when there is nothing to compare with.
bool initted = false;
// Values from the previous batch: low = smallest end, high = largest end.
std::vector < RealD > low (Nstop * 10);
std::vector < RealD > high (Nstop * 10);
// cont becomes non-zero when some value has not settled yet.
RealD cont = 0.;
while (1) {
cont = 0.;
std::vector < RealD > lme2 (Nm);
std::vector < RealD > lmd2 (Nm);
// Nm further Lanczos steps, rotating the three work vectors.
for (uint64_t k = 0; k < Nm; ++k, iter++) {
step (lmd, lme, last, current, next, iter);
last = current;
current = next;
}
double t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
std::
cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
OrthoTime << "seconds" << std::endl;
// Copy the coefficients into the 1-based layout bisec reads:
// diagonal at [1..iter], off-diagonal starting at [2].
lmd2.resize (iter + 2);
lme2.resize (iter + 2);
for (uint64_t k = 0; k < iter; ++k) {
lmd2[k + 1] = lmd[k];
lme2[k + 2] = lme[k];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: copy: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
{
// Split the Nstop values over the ranks: each rank handles `interval`
// consecutive indices.
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (Nstop / total) + 1;
// Slice at the top end of the index range (rank 0 ends at index iter).
// NOTE(review): il drops below 1 once interval * (node + 1) > iter, which
// would index before the start of the arrays inside bisec -- confirm the
// supported rank counts.
int iu = (iter + 1) - (interval * node + 1);
int il = (iter + 1) - (interval * (node + 1));
std::vector < RealD > eval2 (iter + 3);
RealD eps2;
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
// diagonalize(eval2,lme2,iter,Nk,grid);
RealD diff = 0.;
// Compare with the previous batch; high is indexed by distance from iu.
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
fabs (high[iu-i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
high[iu-i], diff);
high[iu-i] = eval2[i];
}
// Slice at the bottom end of the index range (rank 0 starts at index 1).
// NOTE(review): iu here can reach Nstop + total, while low has Nstop * 10
// entries and bisec's arrays are sized from iter -- verify for runs with
// many ranks.
il = (interval * node + 1);
iu = (interval * (node + 1));
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
fabs (low[i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
low[i], diff);
low[i] = eval2[i];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
}
// Placeholder: copying results into eval is disabled.
for (uint64_t k = 0; k < Nk; ++k) {
// eval[k] = eval2[k];
}
if (initted)
{
// Combine the per-rank flags (presumably a sum over all ranks) and stop
// only when no rank asked to continue.
grid->GlobalSumVector (&cont, 1);
if (cont < 1.) return;
}
initted = true;
}
}
#if 0
/**
Fill Q so that its first column is y (Q.e_1 = y); columns j >= 1 are built
from the running partial norms of y. The original note states that Q is
unitary; presumably that requires y to have unit norm -- this is not
checked here.

Returns sqrt(sum_k |y[k]|^2). An empty y returns 0 after Q has been cleared,
and a single-element y returns |y[0]| (previously an uninitialised value).
**/
template < class T >
static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
{
  int N = y.size ();            //Matrix Size
  Fill (Q, 0.0);
  if (N == 0)
    {
      return T (0);             // nothing to place in Q; avoids reading y[0]
    }
  for (int i = 0; i < N; i++)
    {
      Q[i][0] = y[i];
    }
  T sig = conj (y[0]) * y[0];   // running sum of |y[k]|^2
  T tau0 = fabs (sqrt (sig));   // norm of y[0..j-1]
  T tau = tau0;                 // norm of y[0..j]; defined even when the loop body never runs
  for (int j = 1; j < N; j++)
    {
      sig += conj (y[j]) * y[j];
      tau = abs (sqrt (sig));
      if (abs (tau0) > 0.0)
        {
          T gam = conj ((y[j] / tau) / tau0);
          for (int k = 0; k <= j - 1; k++)
            {
              Q[k][j] = -gam * y[k];
            }
          Q[j][j] = tau0 / tau;
        }
      else
        {
          // All earlier components of y vanish: use a unit entry instead.
          Q[j - 1][j] = 1.0;
        }
      tau0 = tau;
    }
  return tau;
}
/**
Runs orthQ(Q, y) and then applies SL to Q, returning the value orthQ
produced. The original note describes the result as a unitary Q with
Q.e_k = y; SL is defined elsewhere, so presumably it moves the y column
into place -- confirm against SL.
**/
template < class T >
static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
{
  T result = orthQ (Q, y);
  SL (Q);
  return result;
}
/**
Wind up with a matrix with the first con rows untouched
say con = 2
Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st column
and the matrix is upper hessenberg
and with f and Q appropriately modified with Q is the arnoldi factorization

Outline of what the body does:
 1. take the trailing (M-con)x(M-con) block AH of H and call Wilkinson on it
    (presumably an eigen-decomposition filling evals/evecs -- confirm);
 2. pick the entry of evals closest to val and build QQ from that row of evecs
    with orthQ;
 3. replace AH by QQ^dag AH QQ and copy QQ / AH into the trailing blocks of Q / H;
 4. sweep j = M-1 ... con+3, building a transform U with orthU and applying it
    to H from both sides and to Q from the right.
NOTE(review): the parameters dfg and herm are not referenced in the body.
NOTE(review): H is accessed as H.dim, H(j,k) and H[i][j]; this code sits under
#if 0 and presumably predates the current DenseMatrix type.
**/
template < class T > static void Lock (DenseMatrix < T > &H, ///Hess mtx
DenseMatrix < T > &Q, ///Lock Transform
T val, ///value to be locked
int con, ///number already locked
RealD small, int dfg, bool herm)
{
//ForceTridiagonal(H);
int M = H.dim;
DenseVector < T > vec;
Resize (vec, M - con);
// AH: the not-yet-locked trailing block of H.
DenseMatrix < T > AH;
Resize (AH, M - con, M - con);
AH = GetSubMtx (H, con, M, con, M);
DenseMatrix < T > QQ;
Resize (QQ, M - con, M - con);
Unity (Q);
Unity (QQ);
DenseVector < T > evals;
Resize (evals, M - con);
DenseMatrix < T > evecs;
Resize (evecs, M - con, M - con);
Wilkinson < T > (AH, evals, evecs, small);
// Find the index k whose evals[k] lies closest to val.
int k = 0;
RealD cold = abs (val - evals[k]);
for (int i = 1; i < M - con; i++)
{
RealD cnew = abs (val - evals[i]);
if (cnew < cold)
{
k = i;
cold = cnew;
}
}
vec = evecs[k];
ComplexD tau;
// QQ gets vec as its first column (see orthQ); the return value is discarded.
orthQ (QQ, vec);
//orthQM(QQ,AH,vec);
// Similarity transform of the trailing block: AH <- QQ^dag AH QQ.
AH = Hermitian (QQ) * AH;
AH = AH * QQ;
// Write the transformed block and the transform back into H and Q.
for (int i = con; i < M; i++)
{
for (int j = con; j < M; j++)
{
Q[i][j] = QQ[i - con][j - con];
H[i][j] = AH[i - con][j - con];
}
}
// Sweep upwards from the last row, one transform U per row j.
for (int j = M - 1; j > con + 2; j--)
{
DenseMatrix < T > U;
Resize (U, j - 1 - con, j - 1 - con);
DenseVector < T > z;
Resize (z, j - 1 - con);
// NOTE(review): nm is taken before z is filled and is never used.
T nm = norm (z);
// z holds the conjugated entries H(j, con+1 .. j-1).
// NOTE: this k shadows the outer k chosen above.
for (int k = con + 0; k < j - 1; k++)
{
z[k - con] = conj (H (j, k + 1));
}
normalise (z);
// Skip this row when all but the last component of z are already negligible.
RealD tmp = 0;
for (int i = 0; i < z.size () - 1; i++)
{
tmp = tmp + abs (z[i]);
}
if (tmp < small / ((RealD) z.size () - 1.0))
{
continue;
}
tau = orthU (U, z);
// Hb[b][a] = (H U)[a][con+1+b]: right-multiply columns con+1..j-1 of H by U.
DenseMatrix < T > Hb;
Resize (Hb, j - 1 - con, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += H[a][con + 1 + c] * U[c][b];
} //sum += H(a,con+1+c)*U(c,b);}
Hb[b][a] = sum;
}
}
// Copy back only after every product was formed from the unmodified H.
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
H[l][k] = Hb[k - 1 - con][l];
}
} //H(Hb[k-1-con][l] , l,k);}}
// Same right-multiplication applied to Q so it accumulates the transform.
DenseMatrix < T > Qb;
Resize (Qb, M, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += Q[a][con + 1 + c] * U[c][b];
} //sum += Q(a,con+1+c)*U(c,b);}
Qb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
Q[l][k] = Qb[k - 1 - con][l];
}
} //Q(Qb[k-1-con][l] , l,k);}}
// Hc[b][a] = (U^dag H)[con+1+a][b]: left-multiply rows con+1..j-1 of H by U^dag.
DenseMatrix < T > Hc;
Resize (Hc, M, M);
for (int a = 0; a < j - 1 - con; a++)
{
for (int b = 0; b < M; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += conj (U[c][a]) * H[con + 1 + c][b];
} //sum += conj( U(c,a) )*H(con+1+c,b);}
Hc[b][a] = sum;
}
}
for (int k = 0; k < M; k++)
{
for (int l = con + 1; l < j; l++)
{
H[l][k] = Hc[k][l - 1 - con];
}
} //H(Hc[k][l-1-con] , l,k);}}
}
}
#endif
};
}
#endif
+138 -5
View File
@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x) inline RealD AggregatePowerLaw(RealD x)
@@ -95,7 +97,7 @@ public:
RealD scale; RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false); ConjugateGradient<FineField> CG(1.0e-4,2000,false);
FineField noise(FineGrid); FineField noise(FineGrid);
FineField Mn(FineGrid); FineField Mn(FineGrid);
@@ -108,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl; hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){ for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]); CG(hermop,noise,subspace[b]);
@@ -124,6 +126,56 @@ public:
} }
} }
// Fill subspace[0..nn-1]. For each vector: start from normalised Gaussian
// noise, then three times run the GCR solver on it (source = current vector,
// zero initial guess) and renormalise the result. The final normalised
// vector is stored in subspace[vec]. Matrix elements of DiracOp are logged
// before and after the filtering.
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
  TrivialPrecon<FineField> simple_fine;
  // Earlier trial settings, kept for reference:
  //   GCR(0.001,10,DiracOp,simple_fine,30,30);
  //   GCR(0.001,10,DiracOp,simple_fine,12,12);
  //   GCR(0.001,30,DiracOp,simple_fine,12,12);
  PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,10,10);

  FineField noise(FineGrid);
  FineField src(FineGrid);
  FineField guess(FineGrid);
  FineField Mn(FineGrid);

  const int npass = 3;   // solver applications per subspace vector

  for(int vec=0; vec<nn; vec++){

    subspace[vec] = Zero();

    // Fresh noise, scaled to unit norm2
    gaussian(RNG,noise);
    RealD scale = std::pow(norm2(noise),-0.5);
    noise=noise*scale;

    DiracOp.Op(noise,Mn);
    std::cout<<GridLogMessage << "noise ["<<vec<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;

    for(int pass=0; pass<npass; pass++){
      // Signature being driven: void operator() (const Field &src, Field &psi)
#if 1
      if (pass==0) {
        std::cout << GridLogMessage << " inverting on noise "<<std::endl;
      }
      src = noise;
      guess=Zero();
      GCR(src,guess);
      subspace[vec] = guess;
#else
      if (pass==0) {
        std::cout << GridLogMessage << " inverting on zero "<<std::endl;
      }
      src=Zero();
      guess = noise;
      GCR(src,guess);
      subspace[vec] = guess;
#endif
      // Renormalise and feed the result into the next pass
      noise = subspace[vec];
      scale = std::pow(norm2(noise),-0.5);
      noise=noise*scale;
    }

    DiracOp.Op(noise,Mn);
    std::cout<<GridLogMessage << "filtered["<<vec<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<" <f|OpDagOp|f>"<<norm2(Mn)<<std::endl;

    subspace[vec] = noise;
  }
}
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found // and this is the best I found
@@ -160,14 +212,21 @@ public:
int b =0; int b =0;
{ {
ComplexD ip;
// Filter // Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter); Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn); Cheb(hermop,noise,Mn);
// normalise // normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn; subspace[b] = Mn;
hermop.Op(Mn,tmp); hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++; b++;
} }
@@ -213,8 +272,18 @@ public:
Mn=*Tnp; Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn; subspace[b] = Mn;
ComplexD ip;
hermop.Op(Mn,tmp); hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++; b++;
} }
@@ -226,8 +295,72 @@ public:
} }
} }
assert(b==nn); GRID_ASSERT(b==nn);
} }
// Fill subspace[0..nn-1] starting from one normalised Gaussian noise vector:
//   subspace[0] = normalised Chebyshev(lo1,hi,orderfilter) applied to the noise;
//   subspace[n] = Chebyshev(lo2,hi,orderstep) applied to subspace[n-1], with the
//                 components along subspace[0..n-1] subtracted (a single
//                 Gram-Schmidt sweep) and the result normalised.
// For every vector the norm2 of hermop.Op applied to it and its own norm2 are logged.
// NOTE(review): subspace[0] is written even if nn == 0, and nn is not checked
// against subspace.size() -- callers presumably pass nn <= nbasis.
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
  RealD scale;

  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  // New normalised noise
  gaussian(RNG,noise);
  scale = std::pow(norm2(noise),-0.5);
  noise=noise*scale;

  std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;

  // Initial matrix element
  hermop.Op(noise,Mn);
  std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

  int b =0;
  {
    // Filter
    // The order reported here is the one the filter is built with (orderfilter);
    // it previously printed orderstep, which belongs to the loop below.
    std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderfilter<<std::endl;
    Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
    Cheb(hermop,noise,Mn);

    // normalise
    scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
    subspace[b] = Mn;

    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
  }

  // Generate a full sequence of Chebyshevs
  for(int n=1;n<nn;n++){
    std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
    Chebyshev<FineField> Cheb(lo2,hi,orderstep);
    Cheb(hermop,subspace[n-1],Mn);

    // Remove the components along the vectors generated so far
    for(int m=0;m<n;m++){
      ComplexD c = innerProduct(subspace[m],Mn);
      Mn = Mn - c*subspace[m];
    }

    // normalise
    scale = std::pow(norm2(Mn),-0.5);
    Mn=Mn*scale;
    subspace[n]=Mn;

    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
  }
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop, virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn, int nn,
double hi, double hi,
+51 -28
View File
@@ -99,7 +99,7 @@ public:
CoarseMatrix AselfInvEven; CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd; CoarseMatrix AselfInvOdd;
Vector<RealD> dag_factor; deviceVector<RealD> dag_factor;
/////////////////////// ///////////////////////
// Interface // Interface
@@ -124,9 +124,13 @@ public:
int npoint = geom.npoint; int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -161,7 +165,7 @@ public:
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
}; };
void Mdag (const CoarseVector &in, CoarseVector &out) void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -190,9 +194,14 @@ public:
int npoint = geom.npoint; int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -201,10 +210,10 @@ public:
int osites=Grid()->oSites(); int osites=Grid()->oSites();
Vector<int> points(geom.npoint, 0); deviceVector<int> points(geom.npoint);
for(int p=0; p<geom.npoint; p++) for(int p=0; p<geom.npoint; p++) {
points[p] = geom.points_dagger[p]; acceleratorPut(points[p],geom.points_dagger[p]);
}
auto points_p = &points[0]; auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0]; RealD* dag_factor_p = &dag_factor[0];
@@ -236,7 +245,7 @@ public:
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
} }
void MdirComms(const CoarseVector &in) void MdirComms(const CoarseVector &in)
@@ -251,8 +260,14 @@ public:
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite); autoView( out_v , out, AcceleratorWrite);
@@ -285,7 +300,7 @@ public:
} }
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
} }
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{ {
@@ -294,7 +309,7 @@ public:
if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
std::cout <<"MdirAll out size "<< out.size()<<std::endl; std::cout <<"MdirAll out size "<< out.size()<<std::endl;
std::cout <<"MdirAll ndir "<< ndir<<std::endl; std::cout <<"MdirAll ndir "<< ndir<<std::endl;
assert(0); GRID_ASSERT(0);
} }
for(int p=0;p<ndir;p++){ for(int p=0;p<ndir;p++){
MdirCalc(in,out[p],p); MdirCalc(in,out[p],p);
@@ -358,7 +373,7 @@ public:
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even); GRID_ASSERT(in.Checkerboard() == Even);
out.Checkerboard() = Odd; out.Checkerboard() = Odd;
DhopInternal(StencilEven, Aodd, in, out, dag); DhopInternal(StencilEven, Aodd, in, out, dag);
@@ -368,7 +383,7 @@ public:
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd); GRID_ASSERT(in.Checkerboard() == Odd);
out.Checkerboard() = Even; out.Checkerboard() = Even;
DhopInternal(StencilOdd, Aeven, in, out, dag); DhopInternal(StencilOdd, Aeven, in, out, dag);
@@ -376,7 +391,7 @@ public:
void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) { void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even);
CoarseMatrix *Aself = nullptr; CoarseMatrix *Aself = nullptr;
if(in.Grid()->_isCheckerBoarded) { if(in.Grid()->_isCheckerBoarded) {
@@ -391,7 +406,7 @@ public:
Aself = (inv) ? &AselfInv : &A[geom.npoint-1]; Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
DselfInternal(Stencil, *Aself, in, out, dag); DselfInternal(Stencil, *Aself, in, out, dag);
} }
assert(Aself != nullptr); GRID_ASSERT(Aself != nullptr);
} }
void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a, void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
@@ -469,14 +484,20 @@ public:
// determine in what order we need the points // determine in what order we need the points
int npoint = geom.npoint-1; int npoint = geom.npoint-1;
Vector<int> points(npoint, 0); deviceVector<int> points(npoint);
for(int p=0; p<npoint; p++) for(int p=0; p<npoint; p++) {
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p; int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
auto points_p = &points[0]; auto points_p = &points[0];
Vector<Aview> AcceleratorViewContainer; deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead)); hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -539,7 +560,7 @@ public:
}); });
} }
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
} }
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) : CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@@ -590,11 +611,13 @@ public:
} }
// GPU readable prefactor // GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, { thread_for(i, nbasis*nbasis, {
int j = i/nbasis; int j = i/nbasis;
int k = i%nbasis; int k = i%nbasis;
dag_factor[i] = dag_factor_eigen(j, k); h_dag_factor[i] = dag_factor_eigen(j, k);
}); });
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
} }
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
@@ -674,7 +697,7 @@ public:
evenmask = where(mod(bcb,2)==(Integer)0,one,zero); evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
oddmask = one-evenmask; oddmask = one-evenmask;
assert(self_stencil!=-1); GRID_ASSERT(self_stencil!=-1);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
@@ -99,7 +99,7 @@ public:
} }
} }
} }
assert(nfound==geom.npoint); GRID_ASSERT(nfound==geom.npoint);
ExchangeCoarseLinks(); ExchangeCoarseLinks();
} }
*/ */
@@ -124,7 +124,7 @@ public:
} }
void Mdag (const CoarseVector &in, CoarseVector &out) void Mdag (const CoarseVector &in, CoarseVector &out)
{ {
assert(hermitian); GRID_ASSERT(hermitian);
Mult(_A,in,out); Mult(_A,in,out);
// if ( hermitian ) M(in,out); // if ( hermitian ) M(in,out);
// else Mult(_Adag,in,out); // else Mult(_Adag,in,out);
@@ -441,8 +441,20 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl; std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
} }
#else #else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace) Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{ {
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl; std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid(); GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
// Orthogonalise the subblocks over the basis // Orthogonalise the subblocks over the basis
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid()); CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace); blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint; const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions(); Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl; std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond(); tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i]; phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond(); tphaseBZ+=usecond();
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl; // std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond(); tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace); blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner; coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner; ComputeProj[p] = coarseInner;
@@ -609,7 +619,7 @@ public:
// _Adag[p]= Cell.ExchangePeriodic(_Adag[p]); // _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
} }
} }
virtual void Mdiag (const Field &in, Field &out){ assert(0);}; virtual void Mdiag (const Field &in, Field &out){ GRID_ASSERT(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);}; virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
}; };
@@ -80,12 +80,12 @@ public:
// Can be used to do I/O on the operator matrices externally // Can be used to do I/O on the operator matrices externally
void SetMatrix (int p,CoarseMatrix & A) void SetMatrix (int p,CoarseMatrix & A)
{ {
assert(A.size()==geom_srhs.npoint); GRID_ASSERT(A.size()==geom_srhs.npoint);
GridtoBLAS(A[p],BLAS_A[p]); GridtoBLAS(A[p],BLAS_A[p]);
} }
void GetMatrix (int p,CoarseMatrix & A) void GetMatrix (int p,CoarseMatrix & A)
{ {
assert(A.size()==geom_srhs.npoint); GRID_ASSERT(A.size()==geom_srhs.npoint);
BLAStoGrid(A[p],BLAS_A[p]); BLAStoGrid(A[p],BLAS_A[p]);
} }
void CopyMatrix (GeneralCoarseOp &_Op) void CopyMatrix (GeneralCoarseOp &_Op)
@@ -178,14 +178,14 @@ public:
for(int32_t point = 0 ; point < geom.npoint; point++){ for(int32_t point = 0 ; point < geom.npoint; point++){
int i=s*orhs*geom.npoint+point; int i=s*orhs*geom.npoint+point;
int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
assert(nbr<BLAS_B.size()); GRID_ASSERT(nbr<BLAS_B.size());
ComplexD * ptr = (ComplexD *)&BLAS_B[nbr]; ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
} }
j++; j++;
} }
} }
assert(j==unpadded_sites); GRID_ASSERT(j==unpadded_sites);
} }
template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to) template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
{ {
@@ -194,7 +194,7 @@ public:
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
GridBase *Fg = from.Grid(); GridBase *Fg = from.Grid();
assert(!Fg->_isCheckerBoarded); GRID_ASSERT(!Fg->_isCheckerBoarded);
int nd = Fg->_ndimension; int nd = Fg->_ndimension;
to.resize(Fg->lSites()); to.resize(Fg->lSites());
@@ -241,10 +241,10 @@ public:
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
GridBase *Tg = grid.Grid(); GridBase *Tg = grid.Grid();
assert(!Tg->_isCheckerBoarded); GRID_ASSERT(!Tg->_isCheckerBoarded);
int nd = Tg->_ndimension; int nd = Tg->_ndimension;
assert(in.size()==Tg->lSites()); GRID_ASSERT(in.size()==Tg->lSites());
Coordinate LocalLatt = Tg->LocalDimensions(); Coordinate LocalLatt = Tg->LocalDimensions();
size_t nsite = 1; size_t nsite = 1;
@@ -669,7 +669,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
int64_t nrhs =pin.Grid()->GlobalDimensions()[0]; int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
assert(nrhs>=1); GRID_ASSERT(nrhs>=1);
RealD flops,bytes; RealD flops,bytes;
int64_t osites=in.Grid()->oSites(); // unpadded int64_t osites=in.Grid()->oSites(); // unpadded
@@ -721,7 +721,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl; // std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl; // std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
}; };
virtual void Mdiag (const Field &in, Field &out){ assert(0);}; virtual void Mdiag (const Field &in, Field &out){ GRID_ASSERT(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);}; virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
}; };
+3 -3
View File
@@ -67,8 +67,8 @@ public:
} }
int point(int dir, int disp) { int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1); GRID_ASSERT(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4); GRID_ASSERT(base+0 <= dir && dir < base+4);
// directions faster index = new indexing // directions faster index = new indexing
// 4d (base = 0): // 4d (base = 0):
@@ -131,7 +131,7 @@ public:
return p; return p;
} }
} }
assert(0); GRID_ASSERT(0);
return -1; return -1;
} }
void BuildShifts(void) void BuildShifts(void)
+11 -29
View File
@@ -57,7 +57,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) { if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
} }
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -69,7 +69,7 @@ public:
} }
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);}; void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
@@ -106,7 +106,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) { if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
} }
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -154,7 +154,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) { if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes ); printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
} }
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -174,19 +174,10 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef ACCELERATOR_CSHIFT template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
// Cshift on device template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using cshiftAllocator = devAllocator<T>; template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
#else template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
/* /*
template<class T> class vecView template<class T> class vecView
@@ -197,8 +188,9 @@ template<class T> class vecView
ViewMode mode; ViewMode mode;
void * cpu_ptr; void * cpu_ptr;
public: public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(std::vector<T> &refer_to_me,ViewMode _mode) vecView(Vector<T> &refer_to_me,ViewMode _mode)
{ {
cpu_ptr = &refer_to_me[0]; cpu_ptr = &refer_to_me[0];
size = refer_to_me.size(); size = refer_to_me.size();
@@ -214,22 +206,12 @@ template<class T> class vecView
} }
}; };
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode) template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
{ {
vecView<T> ret(vec,_mode); // does the open vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed return ret; // must be closed
} }
// Little autoscope assister
template<class View>
class VectorViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
VectorViewCloser(View &_v) : v(_v) {};
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
};
#define autoVecView(v_v,v,mode) \ #define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \ auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v); ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
+2 -2
View File
@@ -292,7 +292,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{ {
#ifdef GRID_OMP #ifdef GRID_OMP
assert(omp_in_parallel()==0); GRID_ASSERT(omp_in_parallel()==0);
#endif #endif
if (ncache == 0) return ptr; if (ncache == 0) return ptr;
@@ -345,7 +345,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{ {
#ifdef GRID_OMP #ifdef GRID_OMP
assert(omp_in_parallel()==0); GRID_ASSERT(omp_in_parallel()==0);
#endif #endif
for(int e=0;e<ncache;e++){ for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
+80 -76
View File
@@ -1,16 +1,15 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#ifndef GRID_UVM #ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define MAXLINE 512 #define MAXLINE 512
static char print_buffer [ MAXLINE ]; static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer; #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...) //#define dprintf(...)
//#define mprintf(...)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// For caching copies of data on device // For caching copies of data on device
@@ -51,12 +50,12 @@ int MemoryManager::EntryPresent(uint64_t CpuPtr)
{ {
if(AccViewTable.empty()) return 0; if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1)); auto count = AccViewTable.count(CpuPtr); GRID_ASSERT((count==0)||(count==1));
return count; return count;
} }
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{ {
assert(!EntryPresent(CpuPtr)); GRID_ASSERT(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache; AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr; AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL; AccCache.AccPtr = (uint64_t)NULL;
@@ -70,9 +69,9 @@ void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,View
} }
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr) MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{ {
assert(EntryPresent(CpuPtr)); GRID_ASSERT(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr); auto AccCacheIterator = AccViewTable.find(CpuPtr);
assert(AccCacheIterator!=AccViewTable.end()); GRID_ASSERT(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator; return AccCacheIterator;
} }
void MemoryManager::EntryErase(uint64_t CpuPtr) void MemoryManager::EntryErase(uint64_t CpuPtr)
@@ -82,7 +81,7 @@ void MemoryManager::EntryErase(uint64_t CpuPtr)
} }
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache) void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{ {
assert(AccCache.LRU_valid==0); GRID_ASSERT(AccCache.LRU_valid==0);
if (AccCache.transient) { if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr); LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end(); AccCache.LRU_entry = --LRU.end();
@@ -95,7 +94,7 @@ void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
} }
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache) void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{ {
assert(AccCache.LRU_valid==1); GRID_ASSERT(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry); LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0; AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes; DeviceLRUBytes-=AccCache.bytes;
@@ -109,19 +108,19 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
// Remove from Accelerator, remove entry, without flush // Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool. // Cannot be locked. If allocated Must be in LRU pool.
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty); GRID_ASSERT(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0); GRID_ASSERT(AccCache.accLock==0);
assert(AccCache.cpuLock==0); GRID_ASSERT(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL); GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) { if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++; DeviceDestroy++;
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
LRUremove(AccCache); LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL; AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
} }
uint64_t CpuPtr = AccCache.CpuPtr; uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr); EntryErase(CpuPtr);
@@ -139,9 +138,9 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
// Take these OUT LRU queue when CPU locked? // Take these OUT LRU queue when CPU locked?
// Cannot take out the table as cpuLock data is important. // Cannot take out the table as cpuLock data is important.
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty); GRID_ASSERT(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n", mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return; if (AccCache.accLock!=0) return;
@@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)NULL; AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
} }
// uint64_t CpuPtr = AccCache.CpuPtr; // uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++; DeviceEvictions++;
@@ -163,28 +162,30 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
} }
void MemoryManager::Flush(AcceleratorViewEntry &AccCache) void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{ {
assert(AccCache.state==AccDirty); GRID_ASSERT(AccCache.state==AccDirty);
assert(AccCache.cpuLock==0); GRID_ASSERT(AccCache.cpuLock==0);
assert(AccCache.accLock==0); GRID_ASSERT(AccCache.accLock==0);
assert(AccCache.AccPtr!=(uint64_t)NULL); GRID_ASSERT(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL); GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes; DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++; DeviceToHostXfer++;
AccCache.state=Consistent; AccCache.state=Consistent;
} }
void MemoryManager::Clone(AcceleratorViewEntry &AccCache) void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{ {
assert(AccCache.state==CpuDirty); GRID_ASSERT(AccCache.state==CpuDirty);
assert(AccCache.cpuLock==0); GRID_ASSERT(AccCache.cpuLock==0);
assert(AccCache.accLock==0); GRID_ASSERT(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL); GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){ if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes; DeviceBytes+=AccCache.bytes;
} }
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes; HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++; HostToDeviceXfer++;
@@ -193,10 +194,10 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{ {
assert(AccCache.state!=Empty); GRID_ASSERT(AccCache.state!=Empty);
assert(AccCache.cpuLock==0); GRID_ASSERT(AccCache.cpuLock==0);
assert(AccCache.accLock==0); GRID_ASSERT(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL); GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){ if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes; DeviceBytes+=AccCache.bytes;
@@ -210,33 +211,36 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode) void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{ {
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr); CpuViewClose((uint64_t)Ptr);
} else { } else {
assert(0); GRID_ASSERT(0);
} }
} }
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{ {
uint64_t CpuPtr = (uint64_t)_CpuPtr; uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else { } else {
assert(0); GRID_ASSERT(0);
return NULL; return NULL;
} }
} }
void MemoryManager::EvictVictims(uint64_t bytes) void MemoryManager::EvictVictims(uint64_t bytes)
{ {
assert(bytes<DeviceMaxBytes); if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
GRID_ASSERT(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){ while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){ if ( DeviceLRUBytes > 0){
assert(LRU.size()>0); GRID_ASSERT(LRU.size()>0);
uint64_t victim = LRU.back(); // From the LRU uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim); auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
@@ -260,19 +264,19 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
if (!AccCache.AccPtr) { if (!AccCache.AccPtr) {
EvictVictims(bytes); EvictVictims(bytes);
} }
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); GRID_ASSERT((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error GRID_ASSERT(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr, (uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr, (uint64_t)CpuPtr,
(uint64_t)AccCache.bytes, (uint64_t)AccCache.bytes,
(uint64_t)bytes, (uint64_t)bytes,
(uint64_t)AccCache.accLock); (uint64_t)AccCache.accLock);
assert(AccCache.CpuPtr == CpuPtr); GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes); GRID_ASSERT(AccCache.bytes ==bytes);
} }
/* /*
* State transitions and actions * State transitions and actions
@@ -289,7 +293,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
* AccWrite AccDirty AccDirty - - * AccWrite AccDirty AccDirty - -
*/ */
if(AccCache.state==Empty) { if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0); GRID_ASSERT(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr; AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL; AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes; AccCache.bytes = bytes;
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent AccCache.state = Consistent; // Empty + AccRead => Consistent
} }
AccCache.accLock= 1; AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){ } else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) { if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache); CpuDiscard(AccCache);
@@ -318,30 +322,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
} }
AccCache.accLock++; AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) { } else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else else
AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++; AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) { } else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++; AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else { } else {
assert(0); GRID_ASSERT(0);
} }
assert(AccCache.accLock>0); GRID_ASSERT(AccCache.accLock>0);
// If view is opened on device must remove from LRU // If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){ if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU // must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n"); dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache); LRUremove(AccCache);
} }
@@ -358,16 +362,16 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
auto AccCacheIterator = EntryLookup(CpuPtr); auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock==0); GRID_ASSERT(AccCache.cpuLock==0);
assert(AccCache.accLock>0); GRID_ASSERT(AccCache.accLock>0);
AccCache.accLock--; AccCache.accLock--;
// Move to LRU queue if not locked and close on device // Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) { if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache); LRUinsert(AccCache);
} else { } else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
} }
} }
void MemoryManager::CpuViewClose(uint64_t CpuPtr) void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -375,8 +379,8 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
auto AccCacheIterator = EntryLookup(CpuPtr); auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock>0); GRID_ASSERT(AccCache.cpuLock>0);
assert(AccCache.accLock==0); GRID_ASSERT(AccCache.accLock==0);
AccCache.cpuLock--; AccCache.cpuLock--;
} }
@@ -409,12 +413,12 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
// EvictVictims(bytes); // EvictVictims(bytes);
// } // }
assert((mode==CpuRead)||(mode==CpuWrite)); GRID_ASSERT((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error GRID_ASSERT(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr); GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes); GRID_ASSERT(AccCache.bytes==bytes);
} }
if(AccCache.state==Empty) { if(AccCache.state==Empty) {
@@ -429,20 +433,20 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++; AccCache.cpuLock++;
} else if(AccCache.state==Consistent) { } else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL); GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite) if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++; AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) { } else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL); GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache); Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++; AccCache.cpuLock++;
} else { } else {
assert(0); // should be unreachable GRID_ASSERT(0); // should be unreachable
} }
AccCache.transient= transient? EvictNext : 0; AccCache.transient= transient? EvictNext : 0;
@@ -524,12 +528,12 @@ void MemoryManager::Audit(std::string s)
std::cout << " Memory Manager::Audit() from "<<s<<std::endl; std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){ for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it; uint64_t cpuPtr = *it;
assert(EntryPresent(cpuPtr)); GRID_ASSERT(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr); auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes; LruBytes2+=AccCache.bytes;
assert(AccCache.LRU_valid==1); GRID_ASSERT(AccCache.LRU_valid==1);
assert(AccCache.LRU_entry==it); GRID_ASSERT(AccCache.LRU_entry==it);
} }
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl; std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
@@ -548,7 +552,7 @@ void MemoryManager::Audit(std::string s)
if( AccCache.LRU_valid ) LruCnt++; if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) { if ( AccCache.cpuLock || AccCache.accLock ) {
assert(AccCache.LRU_valid==0); GRID_ASSERT(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
@@ -557,16 +561,16 @@ void MemoryManager::Audit(std::string s)
<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl; << "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
} }
assert( AccCache.cpuLock== 0 ) ; GRID_ASSERT( AccCache.cpuLock== 0 ) ;
assert( AccCache.accLock== 0 ) ; GRID_ASSERT( AccCache.accLock== 0 ) ;
} }
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl; std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
assert(LruBytes1==LruBytes2); GRID_ASSERT(LruBytes1==LruBytes2);
assert(LruBytes1==DeviceLRUBytes); GRID_ASSERT(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl; std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
assert(AccBytes==DeviceBytes); GRID_ASSERT(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl; std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
assert(LruCnt == LRU.size()); GRID_ASSERT(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl; std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
} }
+5 -5
View File
@@ -10,16 +10,16 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
#ifdef __linux__ #ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY); int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0); GRID_ASSERT(fd >= 0);
const int page_size = 4096; const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size; uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn; off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size; uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages]; std::vector<uint64_t> pagedata(npages);
uint64_t ret = lseek(fd, offset, SEEK_SET); uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset); GRID_ASSERT(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages); GRID_ASSERT(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512; int nhugepages = npages / 512;
int n4ktotal, nnothuge; int n4ktotal, nnothuge;
n4ktotal = 0; n4ktotal = 0;
+5 -5
View File
@@ -82,6 +82,7 @@ public:
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic; int LocallyPeriodic;
Coordinate _checker_dim_mask; Coordinate _checker_dim_mask;
int _checker_dim;
public: public:
@@ -91,7 +92,6 @@ public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim) =0; virtual int CheckerBoarded(int dim) =0;
virtual int CheckerBoard(const Coordinate &site)=0; virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerDim(void){ return 0; };
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
@@ -165,7 +165,7 @@ public:
// //
if ( _simd_layout[dimension] > 2 ) { if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) ); if ( d != dimension ) GRID_ASSERT ( (_simd_layout[d]==1) );
} }
permute_type = RotateBit; // How to specify distance; this is not just direction. permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type; return permute_type;
@@ -187,7 +187,7 @@ public:
inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; }; inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
inline int Nd (void) const { return _ndimension;}; inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; }; inline const Coordinate &LocalStarts(void) { return _lstart; };
inline const Coordinate &FullDimensions(void) { return _fdimensions;}; inline const Coordinate &FullDimensions(void) { return _fdimensions;};
inline const Coordinate &GlobalDimensions(void) { return _gdimensions;}; inline const Coordinate &GlobalDimensions(void) { return _gdimensions;};
inline const Coordinate &LocalDimensions(void) { return _ldimensions;}; inline const Coordinate &LocalDimensions(void) { return _ldimensions;};
@@ -216,11 +216,11 @@ public:
// Global addressing // Global addressing
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){ void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
assert(gidx< gSites()); GRID_ASSERT(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
} }
void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){ void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
assert(lidx<lSites()); GRID_ASSERT(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
} }
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){ void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
+4 -3
View File
@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
@@ -106,6 +106,7 @@ public:
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);; _checker_dim_mask.resize(_ndimension);;
_checker_dim = -1;
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
@@ -127,10 +128,10 @@ public:
// Use a reduced simd grid // Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl; //std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d]; _lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
+11 -11
View File
@@ -57,17 +57,17 @@ class GridRedBlackCartesian : public GridBase
{ {
public: public:
// Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
int _checker_dim; // int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;
virtual int CheckerDim(void){ return _checker_dim; }; virtual int isCheckerBoarded(void) const { return 1; };
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;
} }
virtual int CheckerBoard(const Coordinate &site){ virtual int CheckerBoard(const Coordinate &site){
int linear=0; int linear=0;
assert(site.size()==_ndimension); GRID_ASSERT(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d]) if(_checker_dim_mask[d])
linear=linear+site[d]; linear=linear+site[d];
@@ -160,11 +160,11 @@ public:
_isCheckerBoarded = true; _isCheckerBoarded = true;
_checker_dim = checker_dim; _checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1); GRID_ASSERT(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size(); _ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension); GRID_ASSERT(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension); GRID_ASSERT(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension); GRID_ASSERT(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension); _fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension); _gdimensions.resize(_ndimension);
@@ -190,20 +190,20 @@ public:
if (d == _checker_dim) if (d == _checker_dim)
{ {
assert((_gdimensions[d] & 0x1) == 0); GRID_ASSERT((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2; _gsites /= 2;
} }
_ldimensions[d] = _gdimensions[d] / _processors[d]; _ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d]; _lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
// Use a reduced simd grid // Use a reduced simd grid
_simd_layout[d] = simd_layout[d]; _simd_layout[d] = simd_layout[d];
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
assert(_rdimensions[d] > 0); GRID_ASSERT(_rdimensions[d] > 0);
// all elements of a simd vector must have same checkerboard. // all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d // If Ls vectorised, this must still be the case; e.g. dwf rb5d
+15 -4
View File
@@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
// USE_GRID_REDUCTION build: the single-precision complex global sum is
// delegated to the point-to-point GlobalSumP2P reduction.
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
  this->GlobalSumP2P(c);
}
// USE_GRID_REDUCTION build: the double-precision complex global sum is
// delegated to the point-to-point GlobalSumP2P reduction.
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
  this->GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c) void CartesianCommunicator::GlobalSum(ComplexF &c)
{ {
GlobalSumVector((float *)&c,2); GlobalSumVector((float *)&c,2);
} }
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c) void CartesianCommunicator::GlobalSum(ComplexD &c)
{ {
GlobalSumVector((double *)&c,2); GlobalSumVector((double *)&c,2);
} }
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{ {
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);
+57 -14
View File
@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ; extern bool Stencil_force_mpi ;
@@ -106,7 +108,7 @@ public:
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ; static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes); static void BroadcastWorld(int root,void* data, uint64_t bytes);
static void BarrierWorld(void); static void BarrierWorld(void);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
@@ -128,6 +130,35 @@ public:
void GlobalXOR(uint32_t &); void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &); void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering GRID_ASSERT in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){ template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type; typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type); int words = sizeof(obj)/sizeof(scalar_type);
@@ -138,32 +169,44 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way // Face exchange, buffer swap in translational invariant way
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void CommsComplete(std::vector<CommsRequest_t> &list); void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
int from, int from,
int bytes,int dir); uint64_t bytes,int dir);
void SendToRecvFrom(void *xmit, void SendToRecvFrom(void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,
int recv_from_rank, int recv_from_rank,
int bytes); uint64_t bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit, double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,
void *recv, void *recv,
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
int bytes,int dir); uint64_t bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,
void *recv, void *recv,
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir); uint64_t xbytes,uint64_t rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int xmit_to_rank,int do_xmit,
void *recv,void *recv_comp,
int recv_from_rank,int do_recv,
uint64_t xbytes,uint64_t rbytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i); void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@@ -177,20 +220,20 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Broadcast a buffer and composite larger // Broadcast a buffer and composite larger
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes); void Broadcast(int root,void* data, uint64_t bytes);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// All2All down one dimension // All2All down one dimension
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){ template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0); GRID_ASSERT(dim>=0);
assert(dim<_ndimension); GRID_ASSERT(dim<_ndimension);
assert(in.size()==out.size()); GRID_ASSERT(in.size()==out.size());
int numnode = _processors[dim]; int numnode = _processors[dim];
uint64_t bytes=sizeof(T); uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode; uint64_t words=in.size()/numnode;
assert(numnode * words == in.size()); GRID_ASSERT(numnode * words == in.size());
assert(words < (1ULL<<31)); GRID_ASSERT(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes); AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
} }
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes); void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
+525 -109
View File
@@ -28,9 +28,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
// Ask MPI to abort the job on MPI_COMM_WORLD, passing SIGABRT as the error code.
void GridAbort(void)
{
  MPI_Abort(MPI_COMM_WORLD, SIGABRT);
}
extern void * Grid_backtrace_buffer[_NBACKTRACE];
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
#ifdef GRID_CHECKSUM_COMMS
uint64_t checksum_index = 1;
#endif
//////////////////////////////////////////// ////////////////////////////////////////////
// First initialise of comms system // First initialise of comms system
@@ -55,11 +63,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
#endif #endif
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
assert(0); GRID_ASSERT(0);
} }
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) { if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
assert(0); GRID_ASSERT(0);
} }
} }
@@ -80,20 +88,20 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{ {
int rank; int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank); int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0); GRID_ASSERT(ierr==0);
return rank; return rank;
} }
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor) void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{ {
coor.resize(_ndimension); coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]); int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -120,8 +128,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
////////////////////////////////// //////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{ {
_ndimension = processors.size(); assert(_ndimension>=1); _ndimension = processors.size(); GRID_ASSERT(_ndimension>=1);
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); int parent_ndimension = parent._ndimension; GRID_ASSERT(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0); Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1); Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1); Coordinate shm_processors (_ndimension,1);
@@ -145,7 +153,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
childsize *= processors[d]; childsize *= processors[d];
} }
int Nchild = Nparent/childsize; int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent); GRID_ASSERT (childsize * Nchild == Nparent);
Coordinate ccoor(_ndimension); // coor within subcommunicator Coordinate ccoor(_ndimension); // coor within subcommunicator
Coordinate scoor(_ndimension); // coor of split within parent Coordinate scoor(_ndimension); // coor of split within parent
@@ -171,12 +179,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
// Split the communicator // Split the communicator
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split); int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0); GRID_ASSERT(ierr==0);
} else { } else {
srank = 0; srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split); int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -201,7 +209,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
} }
} }
for(int d=0;d<processors.size();d++){ for(int d=0;d<processors.size();d++){
assert(_processor_coor[d] == ccoor[d] ); GRID_ASSERT(_processor_coor[d] == ccoor[d] );
} }
} }
@@ -243,7 +251,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
for(int i=0;i<_ndimension*2;i++){ for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]); MPI_Comm_dup(communicator,&communicator_halo[i]);
} }
assert(Size==_Nprocessors); GRID_ASSERT(Size==_Nprocessors);
} }
CartesianCommunicator::~CartesianCommunicator() CartesianCommunicator::~CartesianCommunicator()
@@ -257,82 +265,103 @@ CartesianCommunicator::~CartesianCommunicator()
} }
} }
} }
void CartesianCommunicator::GlobalSum(uint32_t &u){ #ifdef USE_GRID_REDUCTION
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(float &f)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){ void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); FlightRecorder::StepLog("GlobalSumP2P");
assert(ierr==0); CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(double &d) void CartesianCommunicator::GlobalSum(double &d)
{ {
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
// In-place sum of one float over all ranks of `communicator` (MPI_Allreduce path).
void CartesianCommunicator::GlobalSum(float &f)
{
  FlightRecorder::StepLog("AllReduce float");
  const int ierr = MPI_Allreduce(MPI_IN_PLACE, &f, 1, MPI_FLOAT, MPI_SUM, communicator);
  GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0); GRID_ASSERT(ierr==0);
}
#endif
// In-place sum of one 32-bit unsigned integer over all ranks of `communicator`.
void CartesianCommunicator::GlobalSum(uint32_t &u)
{
  FlightRecorder::StepLog("AllReduce uint32_t");
  const int ierr = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_SUM, communicator);
  GRID_ASSERT(ierr==0);
}
// In-place sum of one 64-bit unsigned integer over all ranks of `communicator`.
void CartesianCommunicator::GlobalSum(uint64_t &u)
{
  FlightRecorder::StepLog("AllReduce uint64_t");
  const int ierr = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT64_T, MPI_SUM, communicator);
  GRID_ASSERT(ierr==0);
}
// In-place element-wise sum of N 64-bit unsigned integers starting at `u`,
// over all ranks of `communicator`.
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N)
{
  FlightRecorder::StepLog("AllReduceVector");
  const int ierr = MPI_Allreduce(MPI_IN_PLACE, u, N, MPI_UINT64_T, MPI_SUM, communicator);
  GRID_ASSERT(ierr==0);
}
// In-place bitwise XOR of one 32-bit unsigned integer over all ranks of `communicator`.
void CartesianCommunicator::GlobalXOR(uint32_t &u)
{
  const int ierr = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_BXOR, communicator);
  GRID_ASSERT(ierr==0);
}
// In-place bitwise XOR of one 64-bit unsigned integer over all ranks of `communicator`.
void CartesianCommunicator::GlobalXOR(uint64_t &u)
{
  FlightRecorder::StepLog("GlobalXOR");
  const int ierr = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT64_T, MPI_BXOR, communicator);
  GRID_ASSERT(ierr==0);
}
// In-place maximum of one float over all ranks of `communicator`.
void CartesianCommunicator::GlobalMax(float &f)
{
  FlightRecorder::StepLog("GlobalMax");
  const int ierr = MPI_Allreduce(MPI_IN_PLACE, &f, 1, MPI_FLOAT, MPI_MAX, communicator);
  GRID_ASSERT(ierr==0);
}
// In-place maximum of one double over all ranks of `communicator`.
void CartesianCommunicator::GlobalMax(double &d)
{
  FlightRecorder::StepLog("GlobalMax");
  const int ierr = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_MAX, communicator);
  GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
GRID_ASSERT(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(double *d,int N) void CartesianCommunicator::GlobalSumVector(double *d,int N)
{ {
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
int from, int from,
int bytes,int dir) uint64_t bytes,int dir)
{ {
MPI_Request xrq; MPI_Request xrq;
MPI_Request rrq; MPI_Request rrq;
assert(dest != _processor); GRID_ASSERT(dest != _processor);
assert(from != _processor); GRID_ASSERT(from != _processor);
int tag; int tag;
tag= dir+from*32; tag= dir+from*32;
int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq); int ierr=MPI_Irecv(recv,(int)( bytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator,&rrq);
assert(ierr==0); GRID_ASSERT(ierr==0);
list.push_back(rrq); list.push_back(rrq);
tag= dir+_processor*32; tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq); ierr =MPI_Isend(xmit,(int)(bytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator,&xrq);
assert(ierr==0); GRID_ASSERT(ierr==0);
list.push_back(xrq); list.push_back(xrq);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list) void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{ {
int nreq=list.size(); int nreq=list.size();
@@ -340,7 +369,7 @@ void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
std::vector<MPI_Status> status(nreq); std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0); GRID_ASSERT(ierr==0);
list.resize(0); list.resize(0);
} }
@@ -349,50 +378,63 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest, int dest,
void *recv, void *recv,
int from, int from,
int bytes) uint64_t bytes)
{ {
std::vector<CommsRequest_t> reqs(0); std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
// Enforce no UVM in comms, device or host OK // Enforce no UVM in comms, device or host OK
assert(acceleratorIsCommunicable(xmit)); GRID_ASSERT(acceleratorIsCommunicable(xmit));
assert(acceleratorIsCommunicable(recv)); GRID_ASSERT(acceleratorIsCommunicable(recv));
// Give the CPU to MPI immediately; can use threads to overlap optionally // Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, ierr=MPI_Sendrecv(xmit,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,dest,myrank,
recv,bytes,MPI_CHAR,from, from, recv,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,from, from,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
assert(ierr==0); GRID_ASSERT(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
} }
// Basic Halo comms primitive // Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dox, int dest, int dox,
void *recv, void *recv,
int from, int dor, int from, int dor,
int bytes,int dir) uint64_t bytes,int dir)
{ {
std::vector<CommsRequest_t> list; std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir); StencilSendToRecvFromComplete(list,dir);
return offbytes; return offbytes;
} }
// Returns 1 when ShmRanks[rank] is MPI_UNDEFINED and 0 otherwise.
// NOTE(review): presumably MPI_UNDEFINED marks a rank outside this node's
// shared-memory group -- confirm against where ShmRanks is filled.
int CartesianCommunicator::IsOffNode(int rank)
{
  const int shm_rank = ShmRanks[rank];
  return (shm_rank == MPI_UNDEFINED) ? 1 : 0;
}
#undef NVLINK_GET // Define to use get instead of put DMA #ifdef ACCELERATOR_AWARE_MPI
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
// ACCELERATOR_AWARE_MPI build: intentionally a no-op; `list` is left untouched.
// (The stray ';' that followed the function body has been removed.)
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {}
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest,int dox, int dest,int dox,
void *recv, void *recv,
int from,int dor, int from,int dor,
int xbytes,int rbytes,int dir) uint64_t xbytes,uint64_t rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,void *recv_comp,
int from,int dor,
uint64_t xbytes,uint64_t rbytes,int dir)
{ {
int ncomm =communicator_halo.size(); int ncomm =communicator_halo.size();
int commdir=dir%ncomm; int commdir=dir%ncomm;
@@ -405,62 +447,431 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int gfrom = ShmRanks[from]; int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor]; int gme = ShmRanks[_processor];
assert(dest != _processor); GRID_ASSERT(dest != _processor);
assert(from != _processor); GRID_ASSERT(from != _processor);
assert(gme == ShmRank); GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag; int tag;
if ( dor ) { if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32; tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); // std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
assert(ierr==0); ierr=MPI_Irecv(recv_comp,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
GRID_ASSERT(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=rbytes; off_node_bytes+=rbytes;
} }
#ifdef NVLINK_GET #ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit); void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL); GRID_ASSERT(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif #endif
} }
// This is a NVLINK PUT
if (dox) { if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32; tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); ierr =MPI_Isend(xmit_comp,(int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0); GRID_ASSERT(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=xbytes; off_node_bytes+=xbytes;
} else { } else {
#ifndef NVLINK_GET #ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv); void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL); GRID_ASSERT(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif #endif
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
GRID_ASSERT(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* The exchange currently proceeds in three phases:
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
// Phase 1 ("Prepare") of the host-staged halo exchange.
//
// Receive side (dor != 0, and `from` is off-node or Stencil_force_mpi is set):
//   allocates a host buffer, posts MPI_Irecv into it, and appends an
//   InterNodeRecv request to `list`.
// Send side (dox != 0, and `dest` is off-node or Stencil_force_mpi is set):
//   allocates a host buffer, starts an asynchronous device-to-host copy of
//   `xmit`, and appends an InterNodeXmit request to `list`. The MPI send is
//   NOT posted here (the MPI_Isend below is commented out).
//
// With GRID_CHECKSUM_COMMS both byte counts grow by 8 and a 64-bit checksum
// word is written after the payload in the host send buffer.
//
// Returns the number of bytes counted for posted receives only; send bytes
// are not added in this function.
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
uint64_t xbytes,uint64_t rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
#ifdef GRID_CHECKSUM_COMMS
// Reserve 8 extra bytes in each direction for the checksum word.
rbytes += 8;
xbytes += 8;
#endif
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
// The message is counted in 32-bit words, so rbytes is divided by sizeof(int32_t).
ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
GRID_ASSERT(ierr==0);
// Record host and device buffers so the received data can be copied to the device later.
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
#ifdef GRID_CHECKSUM_COMMS
// Payload size without the 8-byte checksum word added above.
uint64_t xbytes_data = xbytes - 8;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
GRID_ASSERT(xbytes % 8 == 0);
// flip one bit so that a zero buffer is not consistent
uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
// Store the checksum word immediately after the payload in the host buffer.
*(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
#else
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
#endif
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// GRID_ASSERT(ierr==0);
// off_node_bytes+=xbytes;
// Queue the send; dest, tag and commdir are stored for whoever posts the MPI send.
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
} }
} }
return off_node_bytes; return off_node_bytes;
} }
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
#ifdef GRID_CHECKSUM_COMMS
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
#else
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
#endif
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint64_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, (int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
GRID_ASSERT(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
// PHASE 2 (Begin) of the pipelined, non accelerator-aware-MPI stencil exchange.
//
// Starts the intra-node (shared memory) device-to-device copy for this direction
// and appends a bookkeeping entry for it to `list`:
//   - with NVLINK_GET defined, and `dor` set, the neighbour `from` is on this node:
//     pull rbytes from the neighbour's buffer (translated from `xmit`) into `recv`;
//   - otherwise, with `dox` set and the neighbour `dest` on this node:
//     push xbytes from `xmit` into the neighbour's buffer (translated from `recv`).
// Nothing is done when the neighbour is off node (its ShmRanks entry is
// MPI_UNDEFINED) or when Stencil_force_mpi is set.
//
// No MPI call is made here: in the visible code the receives are posted by the
// Prepare phase and the sends by StencilSendToRecvFromPollDtoH.
//
// xmit_comp, recv_comp are not used by this implementation.
// Returns the number of off-node bytes started by this call, which is always 0.0.
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
							 void *xmit,void *xmit_comp,
							 int dest,int dox,
							 void *recv,void *recv_comp,
							 int from,int dor,
							 uint64_t xbytes,uint64_t rbytes,int dir)
{
  int gme = ShmRanks[_processor];

  GRID_ASSERT(dest != _processor);
  GRID_ASSERT(from != _processor);
  GRID_ASSERT(gme  == ShmRank);

  const double off_node_bytes=0.0; // this phase never moves data off node

  ////////////////////////////////
  // Receives already posted
  // Copies already started
  ////////////////////////////////
  /*
   * PHASE 2: (Begin)
   * - start the intranode device to device copy asynch
   */
#ifdef NVLINK_GET
  if ( dor ) {
    int gfrom = ShmRanks[from];
    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
      // Intranode
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      GRID_ASSERT(shm!=NULL);

      CommsRequest_t srq;
      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);

      // NOTE(review): the fields below record xbytes/xmit/dest although this copy
      // moves rbytes into recv; kept as in the original.  The visible completion
      // code does not read them for intranode packets -- confirm before relying on them.
      srq.PacketType = IntraNodeRecv;
      srq.bytes      = xbytes;
      srq.host_buf   = NULL;
      srq.device_buf = xmit;
      srq.tag        = -1;
      srq.dest       = dest;
      srq.commdir    = dir;
      list.push_back(srq);
    }
  }
#else
  if (dox) {
    int gdest = ShmRanks[dest];
    if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
      // Intranode
      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
      GRID_ASSERT(shm!=NULL);

      CommsRequest_t srq;
      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);

      srq.PacketType = IntraNodeXmit;
      srq.bytes      = xbytes;
      srq.host_buf   = NULL;
      srq.device_buf = xmit;
      srq.tag        = -1;
      srq.dest       = dest;
      srq.commdir    = dir;
      list.push_back(srq);
    }
  }
#endif
  return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
int nreq=list.size(); acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
acceleratorCopySynchronise(); std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
if (nreq==0) return; for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
std::vector<MPI_Status> status(nreq); if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); // if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
assert(ierr==0);
list.resize(0);
} }
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
GRID_ASSERT(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
uint64_t rbytes_data = list[r].bytes - 8;
uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
if (expected_cs != computed_cs) {
// TODO: error message, backtrace, quit
fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
fprintf(stderr, " processor = %d\n", (int)_processor);
for(int d=0;d<_processors.size();d++)
fprintf(stderr, " processor_coord[%d] = %d\n", d, _processor_coor[d]);
fprintf(stderr, " hostname: %s\n", GridHostname());
fprintf(stderr, " expected_cs: %ld\n", expected_cs);
fprintf(stderr, " computed_cs: %ld\n", computed_cs);
fprintf(stderr, " dest: %d\n", list[r].dest);
fprintf(stderr, " tag: %d\n", list[r].tag);
fprintf(stderr, " commdir: %d\n", list[r].commdir);
fprintf(stderr, " bytes: %ld\n", (uint64_t)list[r].bytes);
fflush(stderr);
// backtrace
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
exit(1);
}
}
}
checksum_index += 1;
#endif
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void) void CartesianCommunicator::StencilBarrier(void)
{ {
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm); MPI_Barrier (ShmComm);
} }
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -468,17 +879,19 @@ void CartesianCommunicator::StencilBarrier(void)
//} //}
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void)
{ {
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator); int ierr = MPI_Barrier(communicator);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) void CartesianCommunicator::Broadcast(int root,void* data,uint64_t bytes)
{ {
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data, int ierr=MPI_Bcast(data,
bytes, (int)bytes,
MPI_BYTE, MPI_BYTE,
root, root,
communicator); communicator);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
int CartesianCommunicator::RankWorld(void){ int CartesianCommunicator::RankWorld(void){
int r; int r;
@@ -486,23 +899,25 @@ int CartesianCommunicator::RankWorld(void){
return r; return r;
} }
void CartesianCommunicator::BarrierWorld(void){ void CartesianCommunicator::BarrierWorld(void){
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world); int ierr = MPI_Barrier(communicator_world);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes)
{ {
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data, int ierr= MPI_Bcast(data,
bytes, (int)bytes,
MPI_BYTE, MPI_BYTE,
root, root,
communicator_world); communicator_world);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
Coordinate row(_ndimension,1); Coordinate row(_ndimension,1);
assert(dim>=0 && dim<_ndimension); GRID_ASSERT(dim>=0 && dim<_ndimension);
// Split the communicator // Split the communicator
row[dim] = _processors[dim]; row[dim] = _processors[dim];
@@ -513,6 +928,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
} }
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{ {
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments // MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data. // 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
@@ -522,8 +938,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
int ibytes; int ibytes;
iwords = words; iwords = words;
ibytes = bytes; ibytes = bytes;
assert(words == iwords); // safe to cast to int ? GRID_ASSERT(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ? GRID_ASSERT(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object); MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object); MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator); MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
+28 -12
View File
@@ -27,6 +27,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
// Terminate the process abnormally by calling abort(); does not return.
void GridAbort(void)
{
  abort();
}
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -34,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char *** arv) void CartesianCommunicator::Init(int *argc, char *** arv)
{ {
GlobalSharedMemory::Init(communicator_world); GlobalSharedMemory::Init(communicator_world);
@@ -54,14 +57,14 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{ {
_shm_processors = Coordinate(processors.size(),1); _shm_processors = Coordinate(processors.size(),1);
_processors = processors; _processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1); _ndimension = processors.size(); GRID_ASSERT(_ndimension>=1);
_processor_coor.resize(_ndimension); _processor_coor.resize(_ndimension);
// Require 1^N processor grid for fake // Require 1^N processor grid for fake
_Nprocessors=1; _Nprocessors=1;
_processor = 0; _processor = 0;
for(int d=0;d<_ndimension;d++) { for(int d=0;d<_ndimension;d++) {
assert(_processors[d]==1); GRID_ASSERT(_processors[d]==1);
_processor_coor[d] = 0; _processor_coor[d] = 0;
} }
SetCommunicator(communicator_world); SetCommunicator(communicator_world);
@@ -87,19 +90,19 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest, int dest,
void *recv, void *recv,
int from, int from,
int bytes) uint64_t bytes)
{ {
assert(0); GRID_ASSERT(0);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);} void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ GRID_ASSERT(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
int from, int from,
int bytes,int dir) uint64_t bytes,int dir)
{ {
assert(0); GRID_ASSERT(0);
} }
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
@@ -113,8 +116,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
int CartesianCommunicator::RankWorld(void){return 0;} int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::Broadcast(int root,void* data, uint64_t bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) { }
void CartesianCommunicator::BarrierWorld(void) { } void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;} int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; } void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
@@ -124,20 +127,33 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0; dest=0;
} }
// This implementation reports every rank as on node: the result is always 0
// and `rank` is not examined.
int CartesianCommunicator::IsOffNode(int rank)
{
  return 0;
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,
void *recv, void *recv,
int recv_from_rank,int dor, int recv_from_rank,int dor,
int bytes, int dir) uint64_t bytes, int dir)
{ {
return 2.0*bytes; return 2.0*bytes;
} }
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
// Empty implementation: `list` is left untouched and nothing is polled.
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
}
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,
void *recv, void *recv,
int recv_from_rank,int dor, int recv_from_rank,int dor,
int xbytes,int rbytes, int dir) uint64_t xbytes,uint64_t rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit_comp,
int xmit_to_rank,int dox,
void *recv, void *recv_comp,
int recv_from_rank,int dor,
uint64_t xbytes,uint64_t rbytes, int dir)
{ {
return xbytes+rbytes; return xbytes+rbytes;
} }
+7 -7
View File
@@ -58,8 +58,8 @@ int GlobalSharedMemory::WorldNode;
void GlobalSharedMemory::SharedMemoryFree(void) void GlobalSharedMemory::SharedMemoryFree(void)
{ {
assert(_ShmAlloc); GRID_ASSERT(_ShmAlloc);
assert(_ShmAllocBytes>0); GRID_ASSERT(_ShmAllocBytes>0);
for(int r=0;r<WorldShmSize;r++){ for(int r=0;r<WorldShmSize;r++){
munmap(WorldShmCommBufs[r],_ShmAllocBytes); munmap(WorldShmCommBufs[r],_ShmAllocBytes);
} }
@@ -80,7 +80,7 @@ void *SharedMemory::HostBufferMalloc(size_t bytes){
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(host_heap_bytes<host_heap_size); GRID_ASSERT(host_heap_bytes<host_heap_size);
} }
return ptr; return ptr;
} }
@@ -100,7 +100,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(heap_bytes<heap_size); GRID_ASSERT(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl; //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr; return ptr;
@@ -127,13 +127,13 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
if ( str ) { if ( str ) {
std::vector<int> IntShmDims; std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims); GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size()); GRID_ASSERT(IntShmDims.size() == WorldDims.size());
long ShmSize = 1; long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) { for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]); ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim])); GRID_ASSERT(divides(ShmDims[dim],WorldDims[dim]));
} }
assert(ShmSize == WorldShmSize); GRID_ASSERT(ShmSize == WorldShmSize);
return; return;
} }
+33 -1
View File
@@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
/*
* Enable state transitions as each packet flows.
*/
// State of one packet in the pipelined (non accelerator-aware MPI) stencil exchange.
// The state is stored in CommsRequest_t::PacketType and is advanced in place as the
// packet moves through the stages of the exchange.
enum PacketType_t {
// NOTE(review): not referenced by the communicator code shown alongside this
// header; presumably marks a face gather stage -- confirm against its users.
FaceGather,
// Outgoing off-node packet whose device-to-host staging copy has been started;
// the MPI send has not been posted yet.
InterNodeXmit,
// Incoming off-node packet whose MPI_Irecv into the host buffer has been posted.
InterNodeRecv,
// On-node packet pushed with a device-to-device copy into the neighbour's buffer.
IntraNodeXmit,
// On-node packet pulled with a device-to-device copy from the neighbour's buffer
// (NVLINK_GET builds).
IntraNodeRecv,
// Outgoing off-node packet whose staging copy finished and whose MPI_Isend has
// been posted; the send request still has to be completed.
InterNodeXmitISend,
// Incoming off-node packet whose receive completed and whose host-to-device copy
// has been started.
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
// Per-packet record used when MPI is not accelerator aware: one entry is appended
// to the request list for every copy or message started by the stencil exchange.
// Not every field is meaningful in every state; see the notes on each member.
typedef struct {
// Current stage of this packet (see PacketType_t).
PacketType_t PacketType;
// Host staging buffer for off-node packets; NULL for on-node packets.
void *host_buf;
// Device buffer associated with the packet.
void *device_buf;
// Destination rank of the message. Filled in for transmit packets; the receive
// path shown with this change does not set it.
int dest;
// MPI tag of the message; -1 for on-node packets.
int tag;
// Direction value recorded for the packet; for off-node transmit packets it is
// used as the index into communicator_halo when the send is posted. Not set by
// the receive path shown with this change.
int commdir;
// Message size in bytes. With GRID_CHECKSUM_COMMS this includes the trailing
// 8-byte checksum word for off-node packets.
unsigned long bytes;
// Event returned by the asynchronous accelerator copy started for this packet.
acceleratorEvent_t ev;
// MPI request of the posted receive or send (off-node packets only).
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t; typedef int CommsRequest_t;
typedef int Grid_MPI_Comm; typedef int Grid_MPI_Comm;
#endif #endif
@@ -105,7 +137,7 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes); // static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };
+70 -67
View File
@@ -42,6 +42,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef ACCELERATOR_AWARE_MPI #ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC #define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS #define SHM_SOCKETS
#else
#endif #endif
#include <syscall.h> #include <syscall.h>
#endif #endif
@@ -66,7 +67,7 @@ public:
{ {
int errnum; int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0); sock = socket(AF_UNIX, SOCK_DGRAM, 0); GRID_ASSERT(sock>0);
struct sockaddr_un sa_un = { 0 }; struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX; sa_un.sun_family = AF_UNIX;
@@ -157,7 +158,7 @@ public:
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
assert(_ShmSetup==0); GRID_ASSERT(_ShmSetup==0);
WorldComm = comm; WorldComm = comm;
MPI_Comm_rank(WorldComm,&WorldRank); MPI_Comm_rank(WorldComm,&WorldRank);
MPI_Comm_size(WorldComm,&WorldSize); MPI_Comm_size(WorldComm,&WorldSize);
@@ -183,7 +184,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
// WorldNodes // WorldNodes
WorldNodes = WorldSize/WorldShmSize; WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize ); GRID_ASSERT( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ? // FIXME: Check all WorldShmSize are the same ?
@@ -208,7 +209,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MyGroup.resize(WorldShmSize); MyGroup.resize(WorldShmSize);
for(int rank=0;rank<WorldSize;rank++){ for(int rank=0;rank<WorldSize;rank++){
if(WorldShmRanks[rank]!=MPI_UNDEFINED){ if(WorldShmRanks[rank]!=MPI_UNDEFINED){
assert(g<WorldShmSize); GRID_ASSERT(g<WorldShmSize);
MyGroup[g++] = rank; MyGroup[g++] = rank;
} }
} }
@@ -224,7 +225,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
// global sum leaders over comm world // global sum leaders over comm world
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm); int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
assert(ierr==0); GRID_ASSERT(ierr==0);
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// find the group leaders world rank // find the group leaders world rank
@@ -245,7 +246,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNode=g; WorldNode=g;
} }
} }
assert(WorldNode!=-1); GRID_ASSERT(WorldNode!=-1);
_ShmSetup=1; _ShmSetup=1;
} }
// Gray encode support // Gray encode support
@@ -287,7 +288,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Assert power of two shm_size. // Assert power of two shm_size.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE); int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1); GRID_ASSERT(log2size != -1);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname // Identify the hypercube coordinate of this node using hostname
@@ -308,7 +309,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Parse ICE-XA hostname to get hypercube location // Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen); gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
assert(nscan==3); GRID_ASSERT(nscan==3);
int nlo = N%9; int nlo = N%9;
int nhi = N/9; int nhi = N/9;
@@ -332,8 +333,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor; hypercoor=hypercoor-rootcoor;
assert(hypercoor<WorldSize); GRID_ASSERT(hypercoor<WorldSize);
assert(hypercoor>=0); GRID_ASSERT(hypercoor>=0);
////////////////////////////////////// //////////////////////////////////////
// Printing // Printing
@@ -381,7 +382,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
for(int i=0;i<ndimension;i++){ for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i]; Nprocessors*=processors[i];
} }
assert(WorldSize==Nprocessors); GRID_ASSERT(WorldSize==Nprocessors);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank // Establish mapping between lexico physics coord and WorldRank
@@ -400,7 +401,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Build the new communicator // Build the new communicator
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{ {
@@ -430,7 +431,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
for(int i=0;i<ndimension;i++){ for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i]; Nprocessors*=processors[i];
} }
assert(WorldSize==Nprocessors); // std::cerr << " WorldSize "<<WorldSize << " Nprocessors "<<Nprocessors<<" "<<processors<<std::endl;
GRID_ASSERT(WorldSize==Nprocessors);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank // Establish mapping between lexico physics coord and WorldRank
@@ -446,7 +448,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
// Build the new communicator // Build the new communicator
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET // SHMGET
@@ -455,8 +457,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1); GRID_ASSERT(_ShmSetup==1);
assert(_ShmAlloc==0); GRID_ASSERT(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
@@ -517,8 +519,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
assert(_ShmSetup==1); GRID_ASSERT(_ShmSetup==1);
assert(_ShmAlloc==0); GRID_ASSERT(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group // allocate the pointer array for shared windows for our group
@@ -537,7 +539,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI #ifndef ACCELERATOR_AWARE_MPI
HostCommBuf= malloc(bytes); // printf("Host buffer allocate for GPU non-aware MPI\n");
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#endif #endif
ShmCommBuf = acceleratorAllocDevice(bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
@@ -545,11 +548,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
if ( WorldRank == 0 ){ if ( WorldRank == 0 ){
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes std::cout << Mheader " acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl; << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
} }
SharedMemoryZero(ShmCommBuf,bytes); SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl; if ( WorldRank == 0 ){
std::cout<< Mheader "Setting up IPC"<<std::endl;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node // Loop over ranks/gpu's on our node
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -569,8 +574,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle; ze_ipc_mem_handle_t ihandle;
clone_mem_t handle; clone_mem_t handle;
@@ -580,8 +585,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( err != ZE_RESULT_SUCCESS ) { if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
} }
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid(); handle.pid = getpid();
@@ -626,7 +629,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
MPI_BYTE, MPI_BYTE,
r, r,
WorldShmComm); WorldShmComm);
assert(ierr==0); GRID_ASSERT(ierr==0);
} }
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@@ -640,12 +643,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef SHM_SOCKETS #ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor(); myfd=UnixSockets::RecvFileDescriptor();
#else #else
std::cout<<"mapping seeking remote pid/fd " // std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/" // <<handle.pid<<"/"
<<handle.fd<<std::endl; // <<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0); int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n"; // std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0); // int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0); myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno; int err_t = errno;
@@ -655,7 +658,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(0); assert(0);
} }
#endif #endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n"; // std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle)); memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int)); memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -664,11 +667,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
} }
assert(thisBuf!=nullptr); GRID_ASSERT(thisBuf!=nullptr);
} }
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
@@ -709,8 +709,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1); GRID_ASSERT(_ShmSetup==1);
assert(_ShmAlloc==0); GRID_ASSERT(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -740,13 +740,14 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) { if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name); printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0); perror("failed mmap"); GRID_ASSERT(0);
} }
assert(((uint64_t)ptr&0x3F)==0); GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; // std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
} }
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
_ShmAlloc=1; _ShmAlloc=1;
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
}; };
@@ -756,8 +757,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1); GRID_ASSERT(_ShmSetup==1);
assert(_ShmAlloc==0); GRID_ASSERT(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -768,7 +769,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Hugetlbf and others map filesystems as mappable huge pages // Hugetlbf and others map filesystems as mappable huge pages
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX]; char shm_name [NAME_MAX];
assert(WorldShmSize == 1); GRID_ASSERT(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){ for(int r=0;r<WorldShmSize;r++){
int fd=-1; int fd=-1;
@@ -782,9 +783,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) { if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name); printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0); perror("failed mmap"); GRID_ASSERT(0);
} }
assert(((uint64_t)ptr&0x3F)==0); GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; // std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
@@ -803,8 +804,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1); GRID_ASSERT(_ShmSetup==1);
assert(_ShmAlloc==0); GRID_ASSERT(_ShmAlloc==0);
MPI_Barrier(WorldShmComm); MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize); WorldShmCommBufs.resize(WorldShmSize);
@@ -835,7 +836,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
perror("failed mmap"); perror("failed mmap");
assert(0); assert(0);
} }
assert(((uint64_t)ptr&0x3F)==0); GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
close(fd); close(fd);
@@ -856,8 +857,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( fd<0 ) { perror("failed shm_open"); assert(0); } if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); } if ( ptr == MAP_FAILED ) { perror("failed mmap"); GRID_ASSERT(0); }
assert(((uint64_t)ptr&0x3F)==0); GRID_ASSERT(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
close(fd); close(fd);
@@ -880,14 +881,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) //void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{ //{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) //#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes); // acceleratorCopyToDevice(src,dest,bytes);
#else //#else
bcopy(src,dest,bytes); // bcopy(src,dest,bytes);
#endif //#endif
} //}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
@@ -914,7 +915,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer // Map ShmRank to WorldShmRank and use the right buffer
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
assert (GlobalSharedMemory::ShmAlloc()==1); GRID_ASSERT (GlobalSharedMemory::ShmAlloc()==1);
heap_size = GlobalSharedMemory::ShmAllocBytes(); heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){ for(int r=0;r<ShmSize;r++){
@@ -923,6 +924,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@@ -975,19 +977,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode; check[0]=GlobalSharedMemory::WorldNode;
check[1]=r; check[1]=r;
check[2]=magic; check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t)); acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier(); acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t)); GRID_ASSERT(check[0]==GlobalSharedMemory::WorldNode);
ShmBarrier(); GRID_ASSERT(check[1]==r);
assert(check[0]==GlobalSharedMemory::WorldNode); GRID_ASSERT(check[2]==magic);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
} }
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
} }
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)
@@ -1002,12 +1003,14 @@ void *SharedMemory::ShmBuffer(int rank)
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{ {
int gpeer = ShmRanks[rank]; int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self GRID_ASSERT(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){ if (gpeer == MPI_UNDEFINED){
return NULL; return NULL;
} else { } else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset; uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote; return (void *) remote;
} }
} }
+10 -10
View File
@@ -34,7 +34,7 @@ NAMESPACE_BEGIN(Grid);
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
assert(_ShmSetup==0); GRID_ASSERT(_ShmSetup==0);
WorldComm = 0; WorldComm = 0;
WorldRank = 0; WorldRank = 0;
WorldSize = 1; WorldSize = 1;
@@ -62,8 +62,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ; void * ShmCommBuf ;
assert(_ShmSetup==1); GRID_ASSERT(_ShmSetup==1);
assert(_ShmAlloc==0); GRID_ASSERT(_ShmAlloc==0);
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
@@ -92,8 +92,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
assert(_ShmSetup==1); GRID_ASSERT(_ShmSetup==1);
assert(_ShmAlloc==0); GRID_ASSERT(_ShmAlloc==0);
int mmap_flag =0; int mmap_flag =0;
#ifdef MAP_ANONYMOUS #ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS; mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
@@ -122,17 +122,17 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{ {
acceleratorMemSet(dest,0,bytes); acceleratorMemSet(dest,0,bytes);
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) //void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{ //{
acceleratorCopyToDevice(src,dest,bytes); // acceleratorCopyToDevice(src,dest,bytes);
} //}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{ {
assert(GlobalSharedMemory::ShmAlloc()==1); GRID_ASSERT(GlobalSharedMemory::ShmAlloc()==1);
ShmRanks.resize(1); ShmRanks.resize(1);
ShmCommBufs.resize(1); ShmCommBufs.resize(1);
ShmRanks[0] = 0; ShmRanks[0] = 0;
-1
View File
@@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{ {
+8 -83
View File
@@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern std::vector<std::pair<int,int> > Cshift_table; extern std::vector<std::pair<int,int> > Cshift_table;
extern commVector<std::pair<int,int> > Cshift_table_device; extern deviceVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void) inline std::pair<int,int> *MapCshiftTable(void)
{ {
// GPU version // GPU version
#ifdef ACCELERATOR_CSHIFT
uint64_t sz=Cshift_table.size(); uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) { if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz); Cshift_table_device.resize(sz);
@@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
sizeof(Cshift_table[0])*sz); sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0]; return &Cshift_table_device[0];
#else
return &Cshift_table[0];
#endif
// CPU version use identify map // CPU version use identify map
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
{ {
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){ if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else { } else {
Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask; Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
} }
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
{ {
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT autoView( rhs_v, rhs, AcceleratorWriteDiscard);
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
}); });
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
} }
} }
@@ -278,8 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT autoView( rhs_v , rhs, AcceleratorWriteDiscard);
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
int b = nn/e1; int b = nn/e1;
@@ -287,21 +236,13 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block; int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}); });
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout << "Scatter_plane merge GRID_ASSERT(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
assert(0); // This will fail if hit on GPU GRID_ASSERT(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite); autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{ {
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite); autoView(lhs_v , lhs, AcceleratorWriteDiscard);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
{ {
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead); autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite); autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
}); });
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
} }
} }
+190 -252
View File
@@ -29,9 +29,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_MPI_H_ #ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_ #define _GRID_CSHIFT_MPI_H_
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#ifdef GRID_CHECKSUM_COMMS
extern uint64_t checksum_index;
#endif
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -45,6 +49,20 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
// Map to always positive shift modulo global full dimension. // Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd; shift = (shift+fd)%fd;
if( shift ==0 ) {
ret = rhs;
return ret;
}
//
// Potential easy fast cases:
// Shift is a multiple of the local lattice extent.
// Then need only to shift whole subvolumes
int L = rhs.Grid()->_ldimensions[dimension];
if ( (shift%L )==0 && !rhs.Grid()->CheckerBoarded(dimension) ) {
Cshift_simple(ret,rhs,dimension,shift);
return ret;
}
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
// the permute type // the permute type
@@ -65,10 +83,59 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
Cshift_comms(ret,rhs,dimension,shift); Cshift_comms(ret,rhs,dimension,shift);
} }
t1=usecond(); t1=usecond();
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret; return ret;
} }
///////////////////////////////////////////////////////////////////
// Fast path for Cshift: move the whole local volume to a neighbouring rank.
//
// Cshift only calls this when `shift` is a non-zero multiple of the local
// extent in `dimension` and that dimension is not checkerboarded, so no
// per-plane gather/scatter is needed; the local field is sent as one message.
//
//   ret       : destination lattice; receives the shifted field
//   rhs       : source lattice (read only)
//   dimension : direction of the shift
//   shift     : shift in sites, already mapped to a positive value by the caller
///////////////////////////////////////////////////////////////////
template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
  GridBase *grid = rhs.Grid();

  int ld       = grid->_ldimensions[dimension];
  int pd       = grid->_processors[dimension];
  int comm_dim = pd > 1;

  // Number of ranks the data hops along this dimension
  int comm_proc = (shift/ld)%pd;

  int xmit_to_rank, recv_from_rank;
  grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

  if(comm_dim) {

    int64_t bytes = sizeof(vobj) * grid->oSites();

    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(ret_v , ret, AcceleratorWrite);
    void *send_buf = (void *)&rhs_v[0];
    void *recv_buf = (void *)&ret_v[0];

#ifdef ACCELERATOR_AWARE_MPI
    // Communicate directly between the accelerator views
    grid->SendToRecvFrom(send_buf,
                         xmit_to_rank,
                         recv_buf,
                         recv_from_rank,
                         bytes);
#else
    // Stage through host buffers: copy out, communicate, copy back in.
    // The buffers are static so they persist between calls.
    static hostVector<vobj> hrhs; hrhs.resize(grid->oSites());
    static hostVector<vobj> hret; hret.resize(grid->oSites());

    void *hsend_buf = (void *)&hrhs[0];
    void *hrecv_buf = (void *)&hret[0];

    acceleratorCopyFromDevice(send_buf,hsend_buf,bytes);

    grid->SendToRecvFrom(hsend_buf,
                         xmit_to_rank,
                         hrecv_buf,
                         recv_from_rank,
                         bytes);

    acceleratorCopyToDevice(hrecv_buf,recv_buf,bytes);
#endif

  } else {
    // Defensive: with a single rank in this dimension comm_proc is always 0,
    // i.e. the data does not move, so hand back a copy instead of leaving
    // ret unwritten.
    // NOTE(review): not expected to be reachable given the caller's
    // shift==0 early return -- confirm.
    ret = rhs;
  }

  // Cshift returns straight after this call, before its own
  // ret.Checkerboard() assignment, so set the parity here with the same
  // expression the general path uses.
  ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
}
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{ {
int sshift[2]; int sshift[2];
@@ -94,7 +161,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; // std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl; // std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3); Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
@@ -104,8 +171,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -119,14 +184,19 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int pd = rhs.Grid()->_processors[dimension]; int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension]; int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ; int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1); GRID_ASSERT(simd_layout==1);
assert(comm_dim==1); GRID_ASSERT(comm_dim==1);
assert(shift>=0); GRID_ASSERT(shift>=0);
assert(shift<fd); GRID_ASSERT(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size); static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size); static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
#endif
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -141,9 +211,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int comm_proc = ((x+sshift)/rd)%pd; int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) { if (comm_proc==0) {
FlightRecorder::StepLog("Cshift_Copy_plane");
tcopy-=usecond(); tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask); Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond(); tcopy+=usecond();
FlightRecorder::StepLog("Cshift_Copy_plane_complete");
} else { } else {
int words = buffer_size; int words = buffer_size;
@@ -151,39 +223,84 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
FlightRecorder::StepLog("Cshift_Gather_plane");
tgather-=usecond(); tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond(); tgather+=usecond();
FlightRecorder::StepLog("Cshift_Gather_plane_complete");
// int rank = grid->_processor; // int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
// grid->Barrier(); grid->Barrier();
FlightRecorder::StepLog("Cshift_SendRecv");
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
GRID_ASSERT(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
GRID_ASSERT(expected_cs == computed_cs);
#else
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
#endif
FlightRecorder::StepLog("Cshift_SendRecv_complete");
xbytes+=bytes; xbytes+=bytes;
// grid->Barrier(); grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
FlightRecorder::StepLog("Cshift_barrier_complete");
tscatter-=usecond(); tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
tscatter+=usecond(); tscatter+=usecond();
} }
} }
/* if (Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/ }
} }
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -205,10 +322,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout // << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl; // << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1); GRID_ASSERT(comm_dim==1);
assert(simd_layout==2); GRID_ASSERT(simd_layout==2);
assert(shift>=0); GRID_ASSERT(shift>=0);
assert(shift<fd); GRID_ASSERT(shift<fd);
RealD tcopy=0.0; RealD tcopy=0.0;
RealD tgather=0.0; RealD tgather=0.0;
@@ -224,8 +341,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd); static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd); static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi; scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi; scalar_object * send_buf_extract_mpi;
@@ -233,6 +350,18 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract[s].resize(buffer_size); send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size); recv_buf_extract[s].resize(buffer_size);
} }
#ifndef ACCELERATOR_AWARE_MPI
#ifdef GRID_CHECKSUM_COMMS
buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#ifdef GRID_CHECKSUM_COMMS
buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
#endif
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
@@ -275,252 +404,62 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if (nbr_ic) nbr_lane|=inner_bit; if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox); GRID_ASSERT (sx == nbr_ox);
if(nbr_proc){ if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
// grid->Barrier(); grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi, grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)recv_buf_extract_mpi, (void *)recv_buf_extract_mpi,
recv_from_rank, recv_from_rank,
bytes); bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
#else #else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) // bouncy bouncy
{ acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
typedef typename vobj::vector_type vector_type; #ifdef GRID_CHECKSUM_COMMS
typedef typename vobj::scalar_type scalar_type; assert(bytes % 8 == 0);
checksum_index++;
GridBase *grid=rhs.Grid(); uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
Lattice<vobj> temp(rhs.Grid()); *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
int fd = rhs.Grid()->_fdimensions[dimension]; #endif
int rd = rhs.Grid()->_rdimensions[dimension]; grid->SendToRecvFrom((void *)&hsend_buf[0],
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&hrecv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
xbytes+=bytes; #ifdef GRID_CHECKSUM_COMMS
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); bytes -= 8;
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
// grid->Barrier(); std::cout << GridLogComms<< " Cshift_comms_simd: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
} else { } else {
rpointers[i] = &send_buf_extract[nbr_lane][0]; rpointers[i] = &send_buf_extract[nbr_lane][0];
@@ -530,17 +469,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
tscatter-=usecond(); tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond(); tscatter+=usecond();
} }
/* if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
} }
#endif }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif
+1 -1
View File
@@ -1,5 +1,5 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
std::vector<std::pair<int,int> > Cshift_table; std::vector<std::pair<int,int> > Cshift_table;
commVector<std::pair<int,int> > Cshift_table_device; deviceVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
+1 -1
View File
@@ -245,7 +245,7 @@ template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * =
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{ {
if ((cb == Odd) || (cb == Even)) { if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.Checkerboard()); GRID_ASSERT(cb == lat.Checkerboard());
} }
cb = lat.Checkerboard(); cb = lat.Checkerboard();
} }
+13
View File
@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
}); });
} }
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpy_norm"); GRID_TRACE("axpy_norm");
#ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y); return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
} }
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpby_norm"); GRID_TRACE("axpby_norm");
#ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y); return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
} }
/// Trace product /// Trace product
+22 -19
View File
@@ -120,12 +120,12 @@ public:
GRID_TRACE("ExpressionTemplateEval"); GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); GRID_ASSERT(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto exprCopy = expr; auto exprCopy = expr;
@@ -144,12 +144,12 @@ public:
GRID_TRACE("ExpressionTemplateEval"); GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); GRID_ASSERT(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto exprCopy = expr; auto exprCopy = expr;
@@ -168,12 +168,12 @@ public:
GRID_TRACE("ExpressionTemplateEval"); GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); GRID_ASSERT(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto exprCopy = expr; auto exprCopy = expr;
ExpressionViewOpen(exprCopy); ExpressionViewOpen(exprCopy);
@@ -191,11 +191,11 @@ public:
Lattice(const LatticeUnaryExpression<Op,T1> & expr) { Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
this->_grid = nullptr; this->_grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr); GRID_ASSERT(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
@@ -206,11 +206,11 @@ public:
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) { Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
this->_grid = nullptr; this->_grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr); GRID_ASSERT(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
@@ -221,11 +221,11 @@ public:
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) { Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
this->_grid = nullptr; this->_grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr); GRID_ASSERT(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); GRID_ASSERT( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
@@ -237,16 +237,19 @@ public:
vobj vtmp; vobj vtmp;
vtmp = r; vtmp = r;
#if 1 #if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(*vvtmp_p);
coalescedWrite(me[ss],stmp);
});
#else
auto me = View(CpuWrite); auto me = View(CpuWrite);
thread_for(ss,me.size(),{ thread_for(ss,me.size(),{
me[ss]= r; me[ss]= r;
}); });
#else
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(vtmp);
coalescedWrite(me[ss],stmp);
});
#endif #endif
me.ViewClose(); me.ViewClose();
return *this; return *this;
@@ -261,7 +264,7 @@ public:
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid; this->_grid = grid;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0); GRID_ASSERT((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0; this->checkerboard=0;
SetViewMode(mode); SetViewMode(mode);
} }
+29 -42
View File
@@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field; typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View; typedef decltype(basis[0].View(AcceleratorRead)) View;
Vector<View> basis_v; basis_v.reserve(basis.size()); hostVector<View> h_basis_v(basis.size());
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj; deviceVector<View> d_basis_v(basis.size());
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t; typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorWrite)); h_basis_v[k] = basis[k].View(AcceleratorWrite);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
} }
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) ) View *basis_vp = &d_basis_v[0];
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
int nrot = j1-j0; int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda if (!nrot) // edge case not handled gracefully by Cuda
@@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites(); uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
Vector <vobj> Bt(siteBlock * nrot); deviceVector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0]; auto Bp=&Bt[0];
// GPU readable copy of matrix // GPU readable copy of matrix
Vector<Coeff_t> Qt_jv(Nm*Nm); hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0]; Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{ thread_for(i,Nm*Nm,{
int j = i/Nm; int j = i/Nm;
int k = i%Nm; int k = i%Nm;
Qt_p[i]=Qt(j,k); h_Qt_jv[i]=Qt(j,k);
}); });
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down // Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){ for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j])); coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
}); });
} }
#endif
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose(); for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
} }
// Extract a single rotated vector // Extract a single rotated vector
@@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard(); result.Checkerboard() = basis[0].Checkerboard();
Vector<View> basis_v; basis_v.reserve(basis.size()); hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorRead)); h_basis_v[k]=basis[k].View(AcceleratorRead);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
} }
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0]; vobj zz=Zero();
deviceVector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
auto basis_vp=& d_basis_v[0];
autoView(result_v,result,AcceleratorWrite); autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero(); vobj zzz=Zero();
@@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
} }
coalescedWrite(result_v[ss], B); coalescedWrite(result_v[ss], B);
}); });
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose(); for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
} }
template<class Field> template<class Field>
@@ -179,9 +166,9 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
{ {
int vlen = idx.size(); int vlen = idx.size();
assert(vlen>=1); GRID_ASSERT(vlen>=1);
assert(vlen<=sort_vals.size()); GRID_ASSERT(vlen<=sort_vals.size());
assert(vlen<=_v.size()); GRID_ASSERT(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) { for (size_t i=0;i<vlen;i++) {
@@ -199,7 +186,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
if (idx[j]==i) if (idx[j]==i)
break; break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i); GRID_ASSERT(idx[i] > i); GRID_ASSERT(j!=idx.size()); GRID_ASSERT(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]); std::swap(sort_vals[i],sort_vals[idx[i]]);
@@ -237,7 +224,7 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
template<class Field> template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) { void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero(); result = Zero();
assert(_v.size()==eval.size()); GRID_ASSERT(_v.size()==eval.size());
int N = (int)_v.size(); int N = (int)_v.size();
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
Field& tmp = _v[i]; Field& tmp = _v[i];
+2 -2
View File
@@ -32,8 +32,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs) template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{ {
assert(lhs.Grid() == rhs.Grid()); GRID_ASSERT(lhs.Grid() == rhs.Grid());
assert(lhs.Checkerboard() == rhs.Checkerboard()); GRID_ASSERT(lhs.Checkerboard() == rhs.Checkerboard());
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
+3 -3
View File
@@ -42,7 +42,7 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
// Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@@ -86,7 +86,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int Nblock = X.Grid()->GlobalDimensions()[Orthog]; int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid(); GridBase *FullGrid = X.Grid();
assert( FullGrid->_simd_layout[Orthog]==1); GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@@ -140,7 +140,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1); GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension; // int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
// int nl = nh-1; // int nl = nh-1;
+10 -10
View File
@@ -98,8 +98,8 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); GRID_ASSERT( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx; int rank,odx,idx;
// Optional to broadcast from node 0. // Optional to broadcast from node 0.
@@ -135,7 +135,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard() == l.Grid()->CheckerBoard(site)); GRID_ASSERT( l.Checkerboard() == l.Grid()->CheckerBoard(site));
int rank,odx,idx; int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
@@ -159,14 +159,14 @@ template<class vobj,class sobj>
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site) inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{ {
GridBase *grid = l.getGrid(); GridBase *grid = l.getGrid();
assert(l.mode==CpuRead); GRID_ASSERT(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site)); // GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;
@@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
pt[w] = getlane(vp[w],idx); pt[w] = getlane(vp[w],idx);
} }
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
return; return;
}; };
template<class vobj,class sobj> template<class vobj,class sobj>
@@ -195,15 +195,15 @@ template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site) inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{ {
GridBase *grid=l.getGrid(); GridBase *grid=l.getGrid();
assert(l.mode==CpuWrite); GRID_ASSERT(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site)); // GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;
+127 -203
View File
@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
// const int Nsimd = vobj::Nsimd(); // const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); std::vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
sumarray[i]=Zero(); sumarray[i]=Zero();
} }
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); std::vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
sumarray[i]=Zero(); sumarray[i]=Zero();
} }
@@ -290,23 +290,45 @@ template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL #ifdef GRID_SYCL
uint64_t csum=0; // uint64_t csum=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) // uint64_t csum2=0;
{ // if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
// {
// Hack // Hack
// Fast integer xor checksum. Can also be used in comms now. // Fast integer xor checksum. Can also be used in comms now.
autoView(l_v,left,AcceleratorRead); // autoView(l_v,left,AcceleratorRead);
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); // Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0]; // uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words); // csum=svm_xor(base,words);
} // ok = FlightRecorder::CsumLog(csum);
FlightRecorder::CsumLog(csum); // if ( !ok ) {
// csum2=svm_xor(base,words);
// std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
// } else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
// }
// GRID_ASSERT(ok);
// }
#endif #endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right); ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm); RealD local = real(nrm);
FlightRecorder::NormLog(real(nrm)); ok = FlightRecorder::NormLog(real(nrm));
grid->GlobalSum(nrm); if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
GRID_ASSERT(ok);
}
FlightRecorder::StepLog("Start global sum");
grid->GlobalSumP2P(nrm);
// grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm)); FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@@ -343,18 +365,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead); autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead); autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite); autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
deviceVector<inner_t> inner_tmp; deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites); inner_tmp.resize(sites);
@@ -365,9 +375,13 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
}); });
bool ok;
nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
#endif ok = FlightRecorder::NormLog(real(nrm));
GRID_ASSERT(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@@ -377,7 +391,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
conformable(left,right); conformable(left,right);
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
Vector<ComplexD> tmp(2); std::vector<ComplexD> tmp(2);
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
@@ -387,8 +401,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
// GPU // GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t; typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites); deviceVector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites); deviceVector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0]; auto norm_tmp_v = &norm_tmp[0];
{ {
@@ -438,7 +452,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
std::vector<typename vobj::scalar_object> &result,
int orthogdim)
{ {
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// FIXME precision promoted summation // FIXME precision promoted summation
@@ -448,20 +464,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type; typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid(); GridBase *grid = Data.Grid();
assert(grid!=NULL); GRID_ASSERT(grid!=NULL);
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0); GRID_ASSERT(orthogdim >= 0);
assert(orthogdim < Nd); GRID_ASSERT(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim]; int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
Vector<vobj> lvSum(rd); // will locally sum vectors first std::vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node result.resize(fd); // And then global sum to return the same vector to every node
@@ -509,6 +525,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
scalar_type * ptr = (scalar_type *) &result[0]; scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type); int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words); grid->GlobalSumVector(ptr, words);
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
} }
template<class vobj> inline template<class vobj> inline
std::vector<typename vobj::scalar_object> std::vector<typename vobj::scalar_object>
@@ -519,28 +537,41 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
return result; return result;
} }
/*
Reimplement
1)
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
2)
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
3)
-- Make Slice Mul Matrix call sliceMaddMatrix
*/
template<class vobj> template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid(); GridBase *grid = lhs.Grid();
assert(grid!=NULL); GRID_ASSERT(grid!=NULL);
conformable(grid,rhs.Grid()); conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0); GRID_ASSERT(orthogdim >= 0);
assert(orthogdim < Nd); GRID_ASSERT(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim]; int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
Vector<vector_type> lvSum(rd); // will locally sum vectors first std::vector<vector_type> lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@@ -670,203 +701,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
} }
}; };
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{ {
int NN = BlockSolverGrid->_ndimension; int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd(); int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0); std::vector<int> latt_phys(NN-1);
std::vector<int> simd_phys(0); Coordinate simd_phys;
std::vector<int> mpi_phys(0); std::vector<int> mpi_phys(NN-1);
Coordinate checker_dim_mask(NN-1);
int checker_dim=-1;
int dd;
for(int d=0;d<NN;d++){ for(int d=0;d<NN;d++){
if( d!=Orthog ) { if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]); latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]); mpi_phys[dd] =BlockSolverGrid->_processors[d];
mpi_phys.push_back(BlockSolverGrid->_processors[d]); checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
dd++;
} }
} }
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
if(BlockSolverGrid->_isCheckerBoarded) {
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
delete tmp;
return (GridBase *) ret;
} else {
return (GridBase *) tmp;
}
} }
*/
template<class vobj> template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{ {
GridBase *FullGrid = X.Grid();
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Ys(SliceGrid);
Lattice<vobj> Rs(SliceGrid);
Lattice<vobj> Xs(SliceGrid);
Lattice<vobj> RR(FullGrid);
RR = R; // Copies checkerboard for insert
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
int Nblock = X.Grid()->GlobalDimensions()[Orthog]; for(int i=0;i<Nslice;i++){
ExtractSlice(Ys,Y,i,Orthog);
GridBase *FullGrid = X.Grid(); ExtractSlice(Rs,R,i,Orthog);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); Rs=Ys;
for(int j=0;j<Nslice;j++){
// Lattice<vobj> Xslice(SliceGrid); ExtractSlice(Xs,X,j,Orthog);
// Lattice<vobj> Rslice(SliceGrid); Rs = Rs + Xs*(scale*aa(j,i));
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( X_v, X, CpuRead);
autoView( Y_v, Y, CpuRead);
autoView( R_v, R, CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
} }
InsertSlice(Rs,RR,i,Orthog);
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
} }
R=RR; // Copy back handles arguments aliasing case
delete SliceGrid;
}; };
template<class vobj> template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{ {
typedef typename vobj::scalar_object sobj; R=Zero();
typedef typename vobj::vector_type vector_type; sliceMaddMatrix(R,aa,X,R,Orthog,scale);
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( R_v, R, CpuWrite);
autoView( X_v, X, CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
}; };
template<class vobj> template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{ {
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
Lattice<vobj> ls(SliceGrid);
Lattice<vobj> rs(SliceGrid);
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = lhs.Grid(); mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); for(int s=0;s<Nslice;s++){
ExtractSlice(ls,lhs,s,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog]; for(int ss=0;ss<Nslice;ss++){
ExtractSlice(rs,rhs,ss,Orthog);
// Lattice<vobj> Lslice(SliceGrid); mat(s,ss) = innerProduct(ls,rs);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}});
thread_critical
{
mat += mat_thread;
} }
} }
delete SliceGrid;
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
+3 -13
View File
@@ -208,28 +208,18 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
Integer numThreads, numBlocks; Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks); int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok); GRID_ASSERT(ok);
Integer smemSize = numThreads * sizeof(sobj); Integer smemSize = numThreads * sizeof(sobj);
// Move out of UVM // Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream // Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise // as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER deviceVector<sobj> buffer(numBlocks);
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
sobj result; sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier(); accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
result = *buffer_v;
#endif
return result; return result;
} }
@@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
const int words = sizeof(vobj)/sizeof(vector); const int words = sizeof(vobj)/sizeof(vector);
Vector<vector> buffer(osites); deviceVector<vector> buffer(osites);
vector *dat = (vector *)lat; vector *dat = (vector *)lat;
vector *buf = &buffer[0]; vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0]; iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
+37 -60
View File
@@ -4,33 +4,28 @@ NAMESPACE_BEGIN(Grid);
// Possibly promote to double and sum // Possibly promote to double and sum
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj> template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD; typedef typename vobj::scalar_objectD sobjD;
static Vector<sobj> mysum;
mysum.resize(1);
sobj *mysum_p = & mysum[0];
sobj identity; zeroit(identity); sobj identity; zeroit(identity);
mysum[0] = identity; sobj ret; zeroit(ret);
sobj ret ;
Integer nsimd= vobj::Nsimd(); Integer nsimd= vobj::Nsimd();
{
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); sycl::buffer<sobj, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites}, cgh.parallel_for(sycl::range<1>{osites},
Reduction, Reduction,
[=] (cl::sycl::id<1> item, auto &sum) { [=] (sycl::id<1> item, auto &sum) {
auto osite = item[0]; auto osite = item[0];
sum +=Reduce(lat[osite]); sum +=Reduce(lat[osite]);
}); });
}); });
theGridAccelerator->wait(); }
ret = mysum[0];
// free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret); sobjD dret; convertType(dret,ret);
return dret; return dret;
} }
@@ -76,59 +71,41 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
template<class Word> Word svm_xor(Word *vec,uint64_t L) template<class Word> Word svm_xor(Word *vec,uint64_t L)
{ {
Word xorResult; xorResult = 0;
static Vector<Word> d_sum;
d_sum.resize(1);
Word *d_sum_p=&d_sum[0];
Word identity; identity=0; Word identity; identity=0;
d_sum[0] = identity; Word ret = 0;
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); {
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { sycl::buffer<Word, 1> abuff(&ret, {1});
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); theGridAccelerator->submit([&](sycl::handler &cgh) {
cgh.parallel_for(cl::sycl::range<1>{L}, auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction, Reduction,
[=] (cl::sycl::id<1> index, auto &sum) { [=] (sycl::id<1> index, auto &sum) {
sum ^=vec[index]; sum ^=vec[index];
}); });
}); });
}
theGridAccelerator->wait();
return ret;
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
auto l = index % 61;
sum ^= vec[index]<<l | vec[index]>>(64-l);
});
});
}
theGridAccelerator->wait(); theGridAccelerator->wait();
Word ret = d_sum[0];
// free(d_sum,*theGridAccelerator);
return ret; return ret;
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
/*
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/
+13 -13
View File
@@ -53,10 +53,10 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; int lowerdims = fine->_ndimension - coarse->_ndimension;
assert(lowerdims >= 0); GRID_ASSERT(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){ for(int d=0;d<lowerdims;d++){
assert(fine->_simd_layout[d]==1); GRID_ASSERT(fine->_simd_layout[d]==1);
assert(fine->_processors[d]==1); GRID_ASSERT(fine->_processors[d]==1);
} }
int multiplicity=1; int multiplicity=1;
@@ -66,9 +66,9 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
// local and global volumes subdivide cleanly after SIMDization // local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){ for(int d=0;d<rngdims;d++){
int fd= d+lowerdims; int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]); GRID_ASSERT(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]); GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]); GRID_ASSERT(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d]; multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
} }
@@ -83,18 +83,18 @@ inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
int rngdims = coarse->_ndimension; int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0); int lowerdims = fine->_ndimension - coarse->_ndimension; GRID_ASSERT(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors // assumes that the higher dimensions are not using more processors
// all further divisions are local // all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1); for(int d=0;d<lowerdims;d++) GRID_ASSERT(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]); for(int d=0;d<rngdims;d++) GRID_ASSERT(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites // then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same // check that the total number of sims agree, meanse the iSites are the same
assert(fine->Nsimd() == coarse->Nsimd()); GRID_ASSERT(fine->Nsimd() == coarse->Nsimd());
// check that the two grids divide cleanly // check that the two grids divide cleanly
assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() ); GRID_ASSERT( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
return fine->lSites() / coarse->lSites(); return fine->lSites() / coarse->lSites();
} }
@@ -177,7 +177,7 @@ public:
skip = skip<<shift; skip = skip<<shift;
assert((skip >> shift)==site); // check for overflow GRID_ASSERT((skip >> shift)==site); // check for overflow
eng.discard(skip); eng.discard(skip);
#else #else
@@ -218,7 +218,7 @@ public:
GetState(saved,_generators[gen]); GetState(saved,_generators[gen]);
} }
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){ void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
assert(saved.size()==RngStateCount); GRID_ASSERT(saved.size()==RngStateCount);
std::stringstream ss; std::stringstream ss;
for(int i=0;i<RngStateCount;i++){ for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" "; ss<< saved[i]<<" ";
+66 -23
View File
@@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP) #if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { template<class vobj>
inline void sliceSumReduction_cub_small(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
size_t subvol_size = e1*e2; size_t subvol_size = e1*e2;
commVector<vobj> reduction_buffer(rd*subvol_size); deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0]; auto rb_p = &reduction_buffer[0];
vobj zero_init; vobj zero_init;
zeroit(zero_init); zeroit(zero_init);
@@ -46,7 +55,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int))); d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device //copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -79,7 +88,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream); acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy //sync after copy
accelerator_barrier(); accelerator_barrier();
@@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
#if defined(GRID_SYCL) #if defined(GRID_SYCL)
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) template<class vobj>
inline void sliceSumReduction_sycl_small(const vobj *Data,
std::vector <vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
size_t subvol_size = e1*e2; size_t subvol_size = e1*e2;
@@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
mysum[r] = vobj_zero; mysum[r] = vobj_zero;
} }
commVector<vobj> reduction_buffer(rd*subvol_size); deviceVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0]; auto rb_p = &reduction_buffer[0];
@@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
}); });
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{subvol_size}, cgh.parallel_for(sycl::range<1>{subvol_size},
Reduction, Reduction,
[=](cl::sycl::id<1> item, auto &sum) { [=](sycl::id<1> item, auto &sum) {
auto s = item[0]; auto s = item[0];
sum += rb_p[r*subvol_size+s]; sum += rb_p[r*subvol_size+s];
}); });
@@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
} }
#endif #endif
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { template<class vobj>
inline void sliceSumReduction_large(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
typedef typename vobj::vector_type vector; typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector); const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2; const int osites = rd*e1*e2;
commVector<vector>buffer(osites); deviceVector<vector>buffer(osites);
vector *dat = (vector *)Data; vector *dat = (vector *)Data;
vector *buf = &buffer[0]; vector *buf = &buffer[0];
Vector<vector> lvSum_small(rd); std::vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0]; vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) { for (int w = 0; w < words; w++) {
@@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r]; lvSum_ptr[w+words*r]=lvSum_small[r];
} }
}
} }
template<class vobj>
} inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{ {
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) { if constexpr (sizeof(vobj) <= 256) {
@@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data
} }
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) template<class vobj>
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
@@ -208,15 +247,19 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
}); });
} }
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else #else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif #endif
} }
+113 -74
View File
@@ -31,32 +31,61 @@ NAMESPACE_BEGIN(Grid);
inline void subdivides(GridBase *coarse,GridBase *fine) inline void subdivides(GridBase *coarse,GridBase *fine)
{ {
assert(coarse->_ndimension == fine->_ndimension); GRID_ASSERT(coarse->_ndimension == fine->_ndimension);
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
// local and global volumes subdivide cleanly after SIMDization // local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
assert(coarse->_processors[d] == fine->_processors[d]); GRID_ASSERT(coarse->_processors[d] == fine->_processors[d]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[d]); GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[d]);
assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]); GRID_ASSERT((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard // remove and insert a half checkerboard
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
{ {
acceleratorPickCheckerboard(cb,half,full); half.Checkerboard() = cb;
autoView( half_v, half, CpuWrite);
autoView( full_v, full, CpuRead);
thread_for(ss, full.Grid()->oSites(),{
int cbos;
Coordinate coor;
full.Grid()->oCoorFromOindex(coor,ss);
cbos=half.Grid()->CheckerBoard(coor);
if (cbos==cb) {
int ssh=half.Grid()->oIndex(coor);
half_v[ssh] = full_v[ss];
}
});
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half) template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
{ {
acceleratorSetCheckerboard(full,half); int cb = half.Checkerboard();
autoView( half_v , half, CpuRead);
autoView( full_v , full, CpuWrite);
thread_for(ss,full.Grid()->oSites(),{
Coordinate coor;
int cbos;
full.Grid()->oCoorFromOindex(coor,ss);
cbos=half.Grid()->CheckerBoard(coor);
if (cbos==cb) {
int ssh=half.Grid()->oIndex(coor);
full_v[ss]=half_v[ssh];
}
});
} }
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0) template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
{ {
half.Checkerboard() = cb; half.Checkerboard() = cb;
autoView(half_v, half, AcceleratorWrite); autoView(half_v, half, AcceleratorWrite);
@@ -66,7 +95,6 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
unsigned long ndim_half = half.Grid()->_ndimension; unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride; Coordinate ostride_half = half.Grid()->_ostride;
int checker_dim_half = half.Grid()->CheckerDim();
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{ accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor; Coordinate coor;
@@ -91,7 +119,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
} }
}); });
} }
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0) template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
{ {
int cb = half.Checkerboard(); int cb = half.Checkerboard();
autoView(half_v , half, AcceleratorRead); autoView(half_v , half, AcceleratorRead);
@@ -101,7 +129,6 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
unsigned long ndim_half = half.Grid()->_ndimension; unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask; Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride; Coordinate ostride_half = half.Grid()->_ostride;
int checker_dim_half = half.Grid()->CheckerDim();
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{ accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor; Coordinate coor;
@@ -282,7 +309,7 @@ inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &co
const VLattice &Basis) const VLattice &Basis)
{ {
int NBatch = fineData.size(); int NBatch = fineData.size();
assert(coarseData.size() == NBatch); GRID_ASSERT(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid(); GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid(); GridBase * coarse= coarseData[0].Grid();
@@ -317,7 +344,7 @@ template<class vobj,class vobj2,class CComplex>
GridBase * coarse= coarseA.Grid(); GridBase * coarse= coarseA.Grid();
fineZ.Checkerboard()=fineX.Checkerboard(); fineZ.Checkerboard()=fineX.Checkerboard();
assert(fineX.Checkerboard()==fineY.Checkerboard()); GRID_ASSERT(fineX.Checkerboard()==fineY.Checkerboard());
subdivides(coarse,fine); // require they map subdivides(coarse,fine); // require they map
conformable(fineX,fineY); conformable(fineX,fineY);
conformable(fineX,fineZ); conformable(fineX,fineZ);
@@ -329,7 +356,7 @@ template<class vobj,class vobj2,class CComplex>
// FIXME merge with subdivide checking routine as this is redundant // FIXME merge with subdivide checking routine as this is redundant
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); GRID_ASSERT(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
autoView( fineZ_ , fineZ, AcceleratorWrite); autoView( fineZ_ , fineZ, AcceleratorWrite);
@@ -586,7 +613,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
// checks // checks
assert( nbasis == Basis.size() ); GRID_ASSERT( nbasis == Basis.size() );
subdivides(coarse,fine); subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
conformable(Basis[i].Grid(),fine); conformable(Basis[i].Grid(),fine);
@@ -660,7 +687,7 @@ inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>
const VLattice &Basis) const VLattice &Basis)
{ {
int NBatch = coarseData.size(); int NBatch = coarseData.size();
assert(fineData.size() == NBatch); GRID_ASSERT(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid(); GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid(); GridBase * coarse = coarseData[0].Grid();
@@ -688,12 +715,12 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
int ni = ig->_ndimension; int ni = ig->_ndimension;
int no = og->_ndimension; int no = og->_ndimension;
assert(ni == no); GRID_ASSERT(ni == no);
for(int d=0;d<no;d++){ for(int d=0;d<no;d++){
assert(ig->_processors[d] == og->_processors[d]); GRID_ASSERT(ig->_processors[d] == og->_processors[d]);
assert(ig->_ldimensions[d] == og->_ldimensions[d]); GRID_ASSERT(ig->_ldimensions[d] == og->_ldimensions[d]);
assert(ig->lSites() == og->lSites()); GRID_ASSERT(ig->lSites() == og->lSites());
} }
autoView(in_v,in,CpuRead); autoView(in_v,in,CpuRead);
@@ -725,16 +752,16 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
GridBase *Fg = From.Grid(); GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid(); GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded); GRID_ASSERT(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded); GRID_ASSERT(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd(); int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension; int nF = Fg->_ndimension;
int nT = Tg->_ndimension; int nT = Tg->_ndimension;
int nd = nF; int nd = nF;
assert(nF == nT); GRID_ASSERT(nF == nT);
for(int d=0;d<nd;d++){ for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]); GRID_ASSERT(Fg->_processors[d] == Tg->_processors[d]);
} }
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
@@ -794,12 +821,12 @@ void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid(); GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid(); GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded); GRID_ASSERT(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded); GRID_ASSERT(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd(); int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension; int nF = Fg->_ndimension;
int nT = Tg->_ndimension; int nT = Tg->_ndimension;
assert(nF+1 == nT); GRID_ASSERT(nF+1 == nT);
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
// do the index calc on the GPU // do the index calc on the GPU
@@ -863,12 +890,12 @@ void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, in
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid(); GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid(); GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded); GRID_ASSERT(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded); GRID_ASSERT(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd(); int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension; int nF = Fg->_ndimension;
int nT = Tg->_ndimension; int nT = Tg->_ndimension;
assert(nT+1 == nF); GRID_ASSERT(nT+1 == nF);
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
// do the index calc on the GPU // do the index calc on the GPU
@@ -928,16 +955,16 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
assert(nl+1 == nh); GRID_ASSERT(nl+1 == nh);
assert(orthog<nh); GRID_ASSERT(orthog<nh);
assert(orthog>=0); GRID_ASSERT(orthog>=0);
assert(hg->_processors[orthog]==1); GRID_ASSERT(hg->_processors[orthog]==1);
int dl; dl = 0; int dl; dl = 0;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d != orthog) { if ( d != orthog) {
assert(lg->_processors[dl] == hg->_processors[d]); GRID_ASSERT(lg->_processors[dl] == hg->_processors[d]);
assert(lg->_ldimensions[dl] == hg->_ldimensions[d]); GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++; dl++;
} }
} }
@@ -954,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[orthog] = slice; hcoor[orthog] = slice;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
} }
ddl++;
}
} }
peekLocalSite(s,lowDimv,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor); pokeLocalSite(s,higherDimv,hcoor);
@@ -972,16 +1005,17 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
assert(nl+1 == nh); GRID_ASSERT(nl+1 == nh);
assert(orthog<nh); GRID_ASSERT(orthog<nh);
assert(orthog>=0); GRID_ASSERT(orthog>=0);
assert(hg->_processors[orthog]==1); GRID_ASSERT(hg->_processors[orthog]==1);
lowDim.Checkerboard() = higherDim.Checkerboard();
int dl; dl = 0; int dl; dl = 0;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d != orthog) { if ( d != orthog) {
assert(lg->_processors[dl] == hg->_processors[d]); GRID_ASSERT(lg->_processors[dl] == hg->_processors[d]);
assert(lg->_ldimensions[dl] == hg->_ldimensions[d]); GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++; dl++;
} }
} }
@@ -993,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
Coordinate lcoor(nl); Coordinate lcoor(nl);
Coordinate hcoor(nh); Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor); lg->LocalIndexToLocalCoor(idx,lcoor);
int ddl=0;
hcoor[orthog] = slice; hcoor[orthog] = slice;
int ddl=0;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
} }
} }
peekLocalSite(s,higherDimv,hcoor); peekLocalSite(s,higherDimv,hcoor);
@@ -1017,14 +1056,14 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
assert(nl == nh); GRID_ASSERT(nl == nh);
assert(orthog<nh); GRID_ASSERT(orthog<nh);
assert(orthog>=0); GRID_ASSERT(orthog>=0);
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]); GRID_ASSERT(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]); GRID_ASSERT(lg->_ldimensions[d] == hg->_ldimensions[d]);
} }
} }
Coordinate sz = lg->_ldimensions; Coordinate sz = lg->_ldimensions;
@@ -1054,7 +1093,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
subdivides(cg,fg); subdivides(cg,fg);
assert(cg->_ndimension==fg->_ndimension); GRID_ASSERT(cg->_ndimension==fg->_ndimension);
Coordinate ratio(cg->_ndimension); Coordinate ratio(cg->_ndimension);
@@ -1118,7 +1157,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
int lex; int lex;
Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions); Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
assert(lex < out.size()); GRID_ASSERT(lex < out.size());
out_ptrs[lane] = &out[lex]; out_ptrs[lane] = &out[lex];
} }
@@ -1182,7 +1221,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
typedef typename vobj::vector_type vtype; typedef typename vobj::vector_type vtype;
GridBase* grid = out.Grid(); GridBase* grid = out.Grid();
assert(in.size()==grid->lSites()); GRID_ASSERT(in.size()==grid->lSites());
const int ndim = grid->Nd(); const int ndim = grid->Nd();
constexpr int nsimd = vtype::Nsimd(); constexpr int nsimd = vtype::Nsimd();
@@ -1229,7 +1268,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
typedef typename vobj::vector_type vtype; typedef typename vobj::vector_type vtype;
GridBase* grid = out._grid; GridBase* grid = out._grid;
assert(in.size()==grid->lSites()); GRID_ASSERT(in.size()==grid->lSites());
int ndim = grid->Nd(); int ndim = grid->Nd();
int nsimd = vtype::Nsimd(); int nsimd = vtype::Nsimd();
@@ -1290,9 +1329,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
template<class VobjOut, class VobjIn> template<class VobjOut, class VobjIn>
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in) void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{ {
assert(out.Grid()->Nd() == in.Grid()->Nd()); GRID_ASSERT(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){ for(int d=0;d<out.Grid()->Nd();d++){
assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]); GRID_ASSERT(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
} }
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
GridBase *in_grid=in.Grid(); GridBase *in_grid=in.Grid();
@@ -1343,9 +1382,9 @@ class precisionChangeWorkspace{
public: public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){ precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd()); GRID_ASSERT(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){ for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]); GRID_ASSERT(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
} }
int Nsimd_out = out_grid->Nsimd(); int Nsimd_out = out_grid->Nsimd();
@@ -1510,7 +1549,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int full_vecs = full.size(); int full_vecs = full.size();
assert(full_vecs>=1); GRID_ASSERT(full_vecs>=1);
GridBase * full_grid = full[0].Grid(); GridBase * full_grid = full[0].Grid();
GridBase *split_grid = split.Grid(); GridBase *split_grid = split.Grid();
@@ -1528,18 +1567,18 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
////////////////////////////// //////////////////////////////
// Checks // Checks
////////////////////////////// //////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension); GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){ for(int n=0;n<full_vecs;n++){
assert(full[n].Checkerboard() == cb); GRID_ASSERT(full[n].Checkerboard() == cb);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]); GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]); GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
} }
} }
int nvector =full_nproc/split_nproc; int nvector =full_nproc/split_nproc;
assert(nvector*split_nproc==full_nproc); GRID_ASSERT(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs); GRID_ASSERT(nvector == full_vecs);
Coordinate ratio(ndim); Coordinate ratio(ndim);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
@@ -1583,7 +1622,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int fvol = lsites; int fvol = lsites;
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol); int chunk = (nvec*fvol)/sP; GRID_ASSERT(chunk*sP == nvec*fvol);
// Loop over reordered data post A2A // Loop over reordered data post A2A
thread_for(c, chunk, { thread_for(c, chunk, {
@@ -1636,7 +1675,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int full_vecs = full.size(); int full_vecs = full.size();
assert(full_vecs>=1); GRID_ASSERT(full_vecs>=1);
GridBase * full_grid = full[0].Grid(); GridBase * full_grid = full[0].Grid();
GridBase *split_grid = split.Grid(); GridBase *split_grid = split.Grid();
@@ -1654,18 +1693,18 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
////////////////////////////// //////////////////////////////
// Checks // Checks
////////////////////////////// //////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension); GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){ for(int n=0;n<full_vecs;n++){
assert(full[n].Checkerboard() == cb); GRID_ASSERT(full[n].Checkerboard() == cb);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]); GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]); GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
} }
} }
int nvector =full_nproc/split_nproc; int nvector =full_nproc/split_nproc;
assert(nvector*split_nproc==full_nproc); GRID_ASSERT(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs); GRID_ASSERT(nvector == full_vecs);
Coordinate ratio(ndim); Coordinate ratio(ndim);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
@@ -1701,7 +1740,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
auto lsites= rsites/M; // Decreases rsites by M auto lsites= rsites/M; // Decreases rsites by M
int fvol = lsites; int fvol = lsites;
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol); int chunk = (nvec*fvol)/sP; GRID_ASSERT(chunk*sP == nvec*fvol);
{ {
// Loop over reordered data post A2A // Loop over reordered data post A2A
+42
View File
@@ -106,6 +106,47 @@ public:
} }
}; };
#ifdef GRID_LOG_VIEWS
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
const char* filename; int line, mode;
public:
ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
v(_v), filename(_filename), line(_line), mode(_mode) {
switch (mode){
case AcceleratorRead:
case AcceleratorWrite:
case CpuRead:
case CpuWrite:
ViewLogger::LogOpen(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
};
~ViewCloser() {
switch (mode) {
case AcceleratorWriteDiscard:
case AcceleratorWrite:
case CpuWrite:
ViewLogger::LogClose(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
v.ViewClose();
}
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
#else
// Little autoscope assister // Little autoscope assister
template<class View> template<class View>
class ViewCloser class ViewCloser
@@ -119,6 +160,7 @@ class ViewCloser
#define autoView(l_v,l,mode) \ #define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \ auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v); ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
#endif
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST // Lattice expression types used by ET to assemble the AST
+44 -14
View File
@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
* *
*/ */
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf, template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
Lattice<vobj> &lat, Lattice<vobj> &lat,
int x, int x,
int dim, int dim,
@@ -82,10 +82,10 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d]; int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int rNsimda= Nsimd/simd[dim]; // should be equal int rNsimda= Nsimd/simd[dim]; // should be equal
assert(rNsimda==rNsimd); GRID_ASSERT(rNsimda==rNsimd);
int face_ovol=block*nblock; int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd); // GRID_ASSERT(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/ /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that //Let's make it work on GPU and then make a special accelerator_for that
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
}); });
} }
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf, template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
const Lattice<vobj> &lat, const Lattice<vobj> &lat,
int x, int x,
int dim, int dim,
@@ -172,7 +172,7 @@ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
int face_ovol=block*nblock; int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd); // GRID_ASSERT(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/ /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that //Let's make it work on GPU and then make a special accelerator_for that
@@ -247,7 +247,7 @@ public:
Coordinate local =unpadded_grid->LocalDimensions(); Coordinate local =unpadded_grid->LocalDimensions();
Coordinate procs =unpadded_grid->ProcessorGrid(); Coordinate procs =unpadded_grid->ProcessorGrid();
for(int d=0;d<dims;d++){ for(int d=0;d<dims;d++){
if ( procs[d] > 1 ) assert(local[d]>=depth); if ( procs[d] > 1 ) GRID_ASSERT(local[d]>=depth);
} }
} }
void DeleteGrids(void) void DeleteGrids(void)
@@ -448,9 +448,9 @@ public:
int nld = to.Grid()->_ldimensions[dimension]; int nld = to.Grid()->_ldimensions[dimension];
const int Nsimd = vobj::Nsimd(); const int Nsimd = vobj::Nsimd();
assert(depth<=lds[dimension]); // A must be on neighbouring node GRID_ASSERT(depth<=lds[dimension]); // A must be on neighbouring node
assert(depth>0); // A caller bug if zero GRID_ASSERT(depth>0); // A caller bug if zero
assert(ld+2*depth==nld); GRID_ASSERT(ld+2*depth==nld);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Face size and byte calculations // Face size and byte calculations
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
@@ -460,15 +460,21 @@ public:
} }
buffer_size = buffer_size / Nsimd; buffer_size = buffer_size / Nsimd;
int rNsimd = Nsimd / simd[dimension]; int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); GRID_ASSERT( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static cshiftVector<vobj> send_buf; static deviceVector<vobj> send_buf;
static cshiftVector<vobj> recv_buf; static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth); send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<CommsRequest_t> fwd_req; std::vector<MpiCommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req; std::vector<MpiCommsRequest_t> bwd_req;
int words = buffer_size; int words = buffer_size;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
@@ -495,9 +501,16 @@ public:
t_gather+=usecond()-t; t_gather+=usecond()-t;
t=usecond(); t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req, grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank, (void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
t_comms+=usecond()-t; t_comms+=usecond()-t;
} }
for ( int d=0;d < depth ; d ++ ) { for ( int d=0;d < depth ; d ++ ) {
@@ -508,9 +521,16 @@ public:
t_gather+= usecond() - t; t_gather+= usecond() - t;
t=usecond(); t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req, grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
t_comms+=usecond()-t; t_comms+=usecond()-t;
} }
@@ -533,6 +553,11 @@ public:
t=usecond(); t=usecond();
grid->CommsComplete(fwd_req); grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t; t_comms+= usecond() - t;
t=usecond(); t=usecond();
@@ -543,6 +568,11 @@ public:
t=usecond(); t=usecond();
grid->CommsComplete(bwd_req); grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t; t_comms+= usecond() - t;
t=usecond(); t=usecond();
+3
View File
@@ -69,6 +69,7 @@ GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL"); GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE"); GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogComms (1, "Comms", GridLogColours, "BLUE");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE"); GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE"); GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE"); GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
@@ -84,6 +85,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogDebug.Active(0); GridLogDebug.Active(0);
GridLogPerformance.Active(0); GridLogPerformance.Active(0);
GridLogDslash.Active(0); GridLogDslash.Active(0);
GridLogComms.Active(0);
GridLogIntegrator.Active(1); GridLogIntegrator.Active(1);
GridLogColours.Active(0); GridLogColours.Active(0);
GridLogHMC.Active(1); GridLogHMC.Active(1);
@@ -97,6 +99,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0); if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);

Some files were not shown because too many files have changed in this diff Show More