1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-25 10:09:34 +01:00

Compare commits

...

189 Commits

Author SHA1 Message Date
dbollweg
7f9d06f339 Merge 461cd045c6 into ccf147d6c1 2024-05-16 16:40:21 -04:00
Peter Boyle
ccf147d6c1 Select the compiler that gives better performance on sunspot 2024-05-07 18:45:56 +00:00
Peter Boyle
7aa12b446f New config command for sunspot 2024-05-07 18:45:40 +00:00
Peter Boyle
c293228102 layout control 2024-05-07 18:45:21 +00:00
Peter Boyle
5c4c9f721a Remove pbs file and replace with bench1 and bench2 for 1 and 2 nodes 2024-05-07 18:44:49 +00:00
Peter Boyle
057f86c1de 2 queues works ok in performance 2024-05-07 18:42:50 +00:00
Peter Boyle
cd52e3cbc2 Jobs on subspot 2024-05-07 18:38:15 +00:00
Peter Boyle
24602e1259 Accidental synchronise 2024-05-07 17:28:38 +00:00
Peter Boyle
8a098889fc Update FlightRecorder.cc 2024-04-30 21:15:08 +01:00
Peter Boyle
ff2ea5de18 Update Tensor_traits.h 2024-04-11 14:25:45 -04:00
Peter Boyle
da59379612 Large reg file for double 2024-03-26 17:03:20 +00:00
Peter Boyle
3ef2a41518 ifdef guard ommitted 2024-03-26 14:50:32 +00:00
Peter Boyle
aa96f420c6 Acclerator ware MPI guard on the Unix domain sockets 2024-03-26 14:41:25 +00:00
Peter Boyle
49e9e4ed0e Fences 2024-03-26 14:14:06 +00:00
Peter Boyle
f7b8163016 Deterministic MPI reduce options 2024-03-26 14:11:40 +00:00
Peter Boyle
93769eacd3 Updated configure for bounce through host 2024-03-26 14:10:24 +00:00
Peter Boyle
59b0cc11df REduce the time in single 2024-03-26 00:42:40 +00:00
Peter Boyle
f32c275376 Updated config options for MPI not being aware of GPU 2024-03-26 00:42:00 +00:00
Peter Boyle
5404fc66ab Merge needs a fence on SYCL 2024-03-26 00:38:41 +00:00
Peter Boyle
1f53458af8 Options to bounce through a host buffer if
--disable-accelerator-aware-mpi
2024-03-26 00:37:19 +00:00
Peter Boyle
434c3e7f1d We have a choice of GET or PUT across NVlink 2024-03-25 14:32:44 +00:00
Peter Boyle
500b119f3d Deterministic MPI 2024-03-22 15:55:23 +00:00
Peter Boyle
4b87259c1b New config command for sunspot 2024-03-22 15:43:49 +00:00
Peter Boyle
503dec34ef This appears working now on Sunspot 2024-03-22 15:43:30 +00:00
Peter Boyle
d1e9fe50d2 Xor csum for repro testing 2024-03-22 15:42:57 +00:00
Peter Boyle
d01e5fa838 Improved FlightRecorder 2024-03-22 15:42:32 +00:00
Peter Boyle
a477c25e8c Sunspot repro tests 2024-03-22 15:42:11 +00:00
Peter Boyle
1bd20cd9e8 FlightRecorder 2024-03-22 15:40:01 +00:00
Peter Boyle
e49e95b037 Upgrade of the Britney test with flight recorder and fast xor checksum 2024-03-22 15:39:27 +00:00
Peter Boyle
6f59fed563 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:32 +00:00
Peter Boyle
60b7f6c99d Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:32:26 +00:00
Peter Boyle
b92dfcc8d3 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:27 +00:00
Peter Boyle
f6fd6dd053 Flight recorder, resurrecting the "world famous" Britney test 2024-03-22 15:30:01 +00:00
Peter Boyle
79ad567dd5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-19 15:43:42 +00:00
Peter Boyle
fab1efb48c More britney logging improvements 2024-03-19 14:36:21 +00:00
Peter Boyle
660eb76d93 FFTW from OneAPI 2024-03-19 14:28:33 +00:00
dbollweg
461cd045c6 sliceSum cleanup 2024-03-13 18:18:44 -04:00
dbollweg
fee65d7a75 Merge branch 'paboyle:develop' into sycl_slicesum_update 2024-03-13 18:06:17 -04:00
dbollweg
31f9971dbf avoid PI_ERROR_OUT_OF_RESOURCES in sycl sliceSum 2024-03-13 13:39:26 -04:00
Peter Boyle
62e7bf024a Updated flight logging for Britney test 2024-03-12 20:10:04 +00:00
Peter Boyle
95f3d69cf9 Extra hardware test hook 2024-03-12 20:09:37 +00:00
89c0519f83 Repro test 2024-03-12 16:11:33 +00:00
2704b82084 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-12 15:16:24 +00:00
cf8632bbac Britney test option 2024-03-12 15:15:35 +00:00
d224297972 PBS scripts 2024-03-12 15:15:16 +00:00
Peter Boyle
a4d11a630f Merge pull request #458 from paboyle/fix/HOST_NAME_MAX
fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined
2024-03-07 07:50:25 -05:00
2b4399f8b1 more HOST_NAME_MAX fix 2024-03-07 15:26:01 +09:00
f17b8de907 fallback to _POSIX_HOST_NAME_MAX if HOST_NAME_MAX is not defined 2024-03-07 15:22:08 +09:00
dbollweg
d87296f3e8 Merge branch 'develop' of https://github.com/dbollweg/Grid into develop 2024-03-06 16:54:22 -05:00
dbollweg
be94cf1c6f Fewer wait-calls in sycl slicesum 2024-03-06 16:53:13 -05:00
Peter Boyle
7e5bd46dd3 Booster update 2024-03-06 19:03:45 +01:00
Peter Boyle
228bbb9d81 Benchmark results 2024-03-06 19:03:35 +01:00
b812a7b4c6 Staggered launch script 2024-03-06 01:32:40 +00:00
891a366f73 Repro CG script 2024-03-06 01:22:55 +00:00
10116b3be8 Force device copyable and tell SYCL to shut it. 2024-03-06 01:13:27 +00:00
a46a0f0882 force device copyable and don't take crap from SYCL 2024-03-06 01:12:49 +00:00
a26a8a38f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-03-06 00:05:00 +00:00
7435315d50 More blasted shell variables 2024-03-06 00:03:59 +00:00
9b5f741e85 Reproducing CG can be more useful now 2024-03-06 00:03:16 +00:00
517822fdd2 SPR HBM benchmarking right and also PVC batched GEMM 2024-03-06 00:02:27 +00:00
1b93a9be88 Print out the hostname 2024-03-06 00:01:58 +00:00
783a66b348 Deterministic reduction please 2024-03-06 00:01:37 +00:00
976c3e9b59 Hack for flight logging CG inner products.
Can be made to work, but could put in some more serious infrastructure
for repro testing and blame attribution (Britney test) if necessary
2024-03-05 23:59:57 +00:00
f8ca971dae Use of a bare PRECISION macro is not namespace safe and collides with
SYCL
2024-03-05 23:59:13 +00:00
21bc8c24df OneMKL batched blas starting 2024-03-05 23:58:20 +00:00
30228214f7 SYCL conflict with Eigen 2024-03-05 23:56:10 +00:00
Peter Boyle
2ae980ae43 Update sourceme.sh 2024-03-05 13:39:18 -05:00
Peter Boyle
6153dec2e4 Update setup.sh 2024-03-05 13:38:32 -05:00
Peter Boyle
c805f86343 USQCD benchmark 2024-03-01 00:05:04 -05:00
Peter Boyle
04ca065281 Only one rank opens 2024-02-29 20:09:11 -05:00
Peter Boyle
88d8fa43d7 Benchmark development 2024-02-29 20:01:44 -05:00
Peter Boyle
3c49762875 Propagate in the blas routine 2024-02-29 15:33:06 -05:00
Peter Boyle
436bf1d9d3 Merge pull request #455 from clarkedavida/hisq_fat_links
Hisq fat links
2024-02-29 15:29:39 -05:00
david clarke
f70df6e195 changed NO_SHIFT and BACKWARD_CONST from define to enum 2024-02-29 12:29:30 -07:00
Peter Boyle
fce3852dff Merge pull request #451 from paboyle/feature/eigen-3.4.0-update
updating Eigen to 3.4.0
2024-02-28 18:03:37 -05:00
Peter Boyle
ee1b8bbdbd Merge pull request #454 from edbennett/adjoint-broke
fix HMC for non-fundamental representations
2024-02-28 14:05:27 -05:00
Peter Boyle
3f1636637d Merge pull request #453 from dbollweg/feature/sliceSum_gpu
Feature/slice sum gpu
2024-02-28 14:04:43 -05:00
Peter Boyle
2e570f5300 Merge pull request #457 from lehner/feature/gpt
Import GPT-related updates
2024-02-28 13:59:04 -05:00
Christoph Lehner
9f89486df5 remove unnecessary code path 2024-02-28 19:56:23 +01:00
Christoph Lehner
22b43b86cb Make GPT test suite work with SYCL 2024-02-28 12:57:17 +01:00
dbollweg
3c9012676a CUDA cub refuses to reduce vSpinColourMatrix, breaking up into smaller parts like already done for HIP case. 2024-02-27 12:41:45 -05:00
Dennis Bollweg
b507fe209c Added SpinColourMatrix case to sliceSum Test 2024-02-27 11:28:32 -05:00
Dennis Bollweg
6cd2d8fcd5 Replace cuda/hip memcpy with Grid functions 2024-02-26 09:55:07 -05:00
david clarke
b02d022993 fixed race condition (thx michael) 2024-02-23 17:14:28 -07:00
david clarke
94581e3c7a accelerator_for is broken 2024-02-23 15:58:33 -07:00
david clarke
88b52cc045 Merge branch 'develop' into hisq_fat_links 2024-02-23 14:47:15 -07:00
dbollweg
0a816b5509 Merge branch 'feature/sliceSum_gpu' of https://github.com/dbollweg/Grid into feature/sliceSum_gpu 2024-02-22 21:43:06 -05:00
dbollweg
1c8b807c2e free malloc'd memory 2024-02-22 21:42:44 -05:00
Christoph Lehner
66391f84f2 Merge branch 'feature/gpt' of ../Grid into develop 2024-02-21 19:05:00 +01:00
97f7a9ecb3 fix HMC for non-fundamental representations 2024-02-21 08:27:55 +00:00
Dennis Bollweg
15878f7613 sliceSumReduction_cub_large now also faster than CPU on Frontier 2024-02-16 13:55:21 -05:00
dbollweg
e0d5e3c6c7 Merge branch 'paboyle:develop' into feature/sliceSum_gpu 2024-02-16 13:16:37 -05:00
dbollweg
6f3455900e Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs 2024-02-16 13:15:02 -05:00
david clarke
56827d6ad6 accelerator_inline bug 2024-02-14 13:56:57 -07:00
73c0b29535 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-02-13 20:19:32 +00:00
303b83cdb8 Scaling benchmarks, verbosity and MPICH aware in acceleratorInit()
For some reason Dirichlet benchmark fails on several nodes; need to
debug this.
2024-02-13 19:48:03 +00:00
5ef4da3f29 Silence verbose 2024-02-13 19:47:36 +00:00
1502860004 Benchmark scripts 2024-02-13 19:47:02 +00:00
585efc6f3f More benchmark scripts 2024-02-13 19:40:49 +00:00
62055e04dd missing semicolon generates error with some compilers 2024-02-13 18:18:27 +01:00
e4a641b64e removing old Eigen tensor patch 2024-02-13 10:37:14 +01:00
8849f187f1 updating Eigen to 3.4.0 2024-02-13 10:30:22 +01:00
david clarke
db420525b3 fix Simd::Nsimd typo 2024-02-12 15:03:53 -07:00
dbollweg
b5659d106e more test cases 2024-02-09 13:37:14 -05:00
dbollweg
4b43307402 Undo include path changes for level zero api header 2024-02-09 13:07:56 -05:00
dbollweg
09af8c25a2 Merge branch 'paboyle:develop' into feature/sliceSum_gpu 2024-02-09 13:02:59 -05:00
dbollweg
9514035b87 refactor slicesum: slicesum uses GPU version by default now 2024-02-09 13:02:28 -05:00
david clarke
2da09ae99b acceleration compiles and doesn't break scalar mode 2024-02-06 18:40:13 -07:00
david clarke
a38fb0e04a first effort toward accelerators 2024-02-06 18:24:55 -07:00
7019916294 RNG seed change safer for large volumes; this is a long term solution 2024-02-07 00:56:39 +00:00
dbollweg
1514b4f137 slicesum_sycl passes test 2024-02-06 19:08:44 -05:00
91cf5ee312 Updated bench script 2024-02-06 23:45:10 +00:00
david clarke
0a6e2f42c5 small amount of cleanup 2024-02-06 16:32:07 -07:00
dbollweg
ab2de131bd work towards sliceSum for sycl backend 2024-02-06 13:24:45 -05:00
5bfa88be85 Aurora MPI standalone benchmake and options that work well 2024-02-06 16:28:40 +00:00
Dennis Bollweg
5af8da76d7 Fix cuda compilation of Lattice_slicesum_gpu.h 2024-02-01 18:02:30 -05:00
Dennis Bollweg
b8b9dc952d Async memcpy's and cleanup 2024-02-01 17:55:35 -05:00
Dennis Bollweg
79a6ed32d8 Use accelerator_for2d and DeviceSegmentedRecude to avoid kernel launch latencies 2024-02-01 16:41:03 -05:00
dbollweg
caa5f97723 Add sliceSum gpu using cub/hipcub 2024-01-31 16:50:06 -05:00
david clarke
4924b3209e projectU3 yields a unitary matrix 2024-01-23 14:43:58 -07:00
david clarke
00f24f8765 already found some bugs in projection, still needs testing 2024-01-22 05:50:16 -07:00
david clarke
f5b3d582b0 first attempt at U3 projection 2024-01-22 02:49:40 -07:00
david clarke
981c93d67a update Test_fatLinks to accept Naik 2024-01-21 21:09:19 -07:00
david clarke
c020b78e02 Merge branch 'develop' into hisq_fat_links 2024-01-21 20:21:08 -07:00
2a0d75bac2 Aurora files 2023-12-21 23:20:17 +00:00
Peter Boyle
f48298ad4e Bug fix 2023-12-11 20:57:02 -05:00
root
645e47c1ba Config for Ampere Altra ARM 2023-12-08 16:17:56 -05:00
Peter Boyle
d1d9827263 Integrator logging update 2023-12-08 12:14:00 -05:00
Peter Boyle
14643c0aab SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512) 2023-12-04 15:45:57 -05:00
Peter Boyle
b77a9b8947 SDDC compiles starting 2023-11-30 14:31:51 -05:00
Peter Boyle
7d077fe493 Frontier compiel 2023-11-09 13:58:44 -05:00
david clarke
9cd4128833 fix naik bug 2023-11-03 14:11:38 -06:00
david clarke
c8b17c9526 Naik to CShift 2023-11-02 12:43:22 -06:00
david clarke
2ae2a81e85 attempt to fix Naik 2023-10-31 13:54:55 -06:00
david clarke
69c869d345 fixed stupid typo 2023-10-30 17:41:52 -06:00
david clarke
df9b958c40 naik now returns separately 2023-10-30 17:40:53 -06:00
david clarke
3d3376d1a3 LePage works, trying Naik 2023-10-27 16:26:31 -06:00
Christoph Lehner
f2648e94b9 getHostPointer added to Lattice 2023-10-23 13:47:41 +02:00
david clarke
21ed6ac0f4 added floating-point support 2023-10-20 13:54:26 -06:00
david clarke
7bb8ab7000 improve smearing templating 2023-10-20 08:41:02 -06:00
david clarke
2c824c2641 Merge branch 'develop' into hisq_fat_links 2023-10-17 16:03:59 -06:00
david clarke
391fd9cc6a try lepage term 2023-10-17 14:57:15 -06:00
Peter Boyle
51051df62c 3GeV run setup 2023-10-16 20:49:52 +03:00
Peter Boyle
33097681b9 FTHMC compiled and merged to develop 2023-10-14 00:42:55 +03:00
Peter Boyle
07e4900218 FTHMC commit 2023-10-13 18:21:57 +03:00
Peter Boyle
36ab567d67 FTHMC 3 Gev 2023-10-13 18:21:57 +03:00
Peter Boyle
e19171523b FTHMC Status at lattice conference commit 2023-10-13 18:21:56 +03:00
Peter Boyle
9626a2c7c0 Asynch handling 2023-10-13 18:21:56 +03:00
Peter Boyle
e936f5b80b IfGridTensor shorthand 2023-10-13 18:21:56 +03:00
Peter Boyle
ffc0639cb9 Running in HMC tests 2023-10-13 18:21:56 +03:00
Peter Boyle
c5b43b322c traceProduct eliminates non-contributing intermediate terms 2023-10-13 18:21:56 +03:00
Peter Boyle
c9c4576237 Improved frontier cshift 2023-10-13 18:21:56 +03:00
david clarke
bf4369f72d clean up HISQSmear with decltypes 2023-10-12 12:41:06 -06:00
david clarke
36600899e2 working 7-link; Grid_log; generalShift 2023-10-12 11:11:39 -06:00
david clarke
b9c70d156b Merge branch 'develop' into hisq_fat_links 2023-10-10 22:44:17 -06:00
david clarke
eb89579fe7 Merge remote-tracking branch 'origin/develop' into develop 2023-10-10 22:43:51 -06:00
david clarke
0cfd13d18b 7-link working 2023-10-10 22:41:52 -06:00
Christoph Lehner
e6ed516052 merged 2023-10-08 09:00:37 +02:00
Christoph Lehner
e2a3dae1f2 Option for multiple simultaneous CartesianStencils 2023-10-08 08:58:44 +02:00
david clarke
63d9b8e8a3 Merge remote-tracking branch 'origin/develop' into hisq_fat_links 2023-09-16 23:20:31 -06:00
david clarke
d247031c98 try 7-link 2023-09-16 23:18:16 -06:00
david clarke
affff3865f Merge branch 'develop' into hisq_fat_links 2023-08-11 23:08:04 -06:00
david clarke
9c22655b5a Merge remote-tracking branch 'origin/develop' into develop 2023-08-11 23:06:42 -06:00
david clarke
99d879ea7f 5-link first attempt 2023-08-11 22:56:30 -06:00
david clarke
9d263d9a7d fix bug in HISQSmearing; move benchmark b/c i don't understand how makefiles work 2023-06-28 10:05:34 -06:00
david clarke
9015c229dc add benchmark to see whether matrix multiplication is slower than read from object 2023-06-27 21:28:26 -06:00
david clarke
a7eabaad56 rudimentary appendShift convenience method, which allows the user to append an arbitrary shift in one line 2023-06-26 23:59:28 -06:00
david clarke
eeb4703b84 develop wrappers to make the stencils easier to construct 2023-06-26 17:45:35 -06:00
david clarke
a07421b3d3 Merge branch 'develop' into hisq_fat_links 2023-06-26 13:51:32 -06:00
david clarke
cda53b4068 Merge remote-tracking branch 'origin/develop' into develop 2023-06-26 13:51:06 -06:00
david clarke
df99f227c1 include missing staple orientations; invert path direction, which was backwards 2023-06-22 14:57:10 -06:00
david clarke
d536c67b9d add HISQSmearing to Smearing.h 2023-06-20 16:04:48 -06:00
david clarke
f44f005dad rename _lvl1 --> _linkTreatment 2023-06-20 15:48:27 -06:00
david clarke
26b2caf570 add template parameter to Smear_HISQ_fat for MILC interfacing 2023-06-20 15:37:54 -06:00
david clarke
8bb078db25 Merge branch 'develop' into hisq_fat_links 2023-06-20 13:05:00 -06:00
david clarke
b61ba40023 Merge remote-tracking branch 'origin/develop' into develop 2023-06-20 13:04:53 -06:00
Christoph Lehner
452bf2e907 Accelerator basisRotate also on HIP 2023-06-20 20:36:24 +03:00
david clarke
14d352ea4f added smearParams struct 2023-06-12 16:55:44 -06:00
david clarke
1cf9ec1cce now compiles 2023-06-09 16:27:45 -06:00
david clarke
4b994a1bc7 trouble with compilation 2023-06-08 17:37:25 -06:00
david clarke
e506d6d369 Merge branch 'develop' into hisq_fat_links 2023-06-07 21:16:20 -06:00
david clarke
ab56ad8d7a fix 3-link stencil 2023-06-07 21:14:58 -06:00
Christoph Lehner
e8c29e2fe5 Merge pull request #31 from paboyle/develop
Sync
2023-05-28 16:13:12 +02:00
david clarke
3825329f8e Merge branch 'develop' into hisq_fat_links 2023-05-24 15:37:25 -06:00
david clarke
c7bdf2c0e4 3-link test at least gives an answer 2023-05-21 04:33:20 -06:00
Christoph Lehner
da9cbfc7cc Suppress BuildSurfaceList verbosity in Stencil.h 2023-05-19 20:22:20 +02:00
Christoph Lehner
6b9f07c1ed Merge pull request #30 from paboyle/develop
Merge upstream
2023-05-19 20:20:58 +02:00
david clarke
bf91778550 verbose plaquette example; fat link test frame 2023-05-17 15:15:54 -06:00
Christoph Lehner
5f75735dab Add M and Mdag to WilsonTMFermion 2023-04-06 18:25:05 +02:00
112 changed files with 7403 additions and 579 deletions

4
.gitignore vendored
View File

@@ -1,3 +1,7 @@
# Doxygen stuff
html/*
latex/*
# Compiled Object files #
#########################
*.slo

View File

@@ -34,7 +34,7 @@
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#ifdef USE_MKL
#if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h>
#else
#include <fftw3.h>

View File

@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
* Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
* type = 1 for the approximation which is infinite at x = 0. */
zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
l, invlambda, xi, xisq, *tv, s, opl;
int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to PRECISION for external use only */
/* Converting everything to ZOLO_PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
}
zolotarev_data* higham(PRECISION epsilon, int n) {
zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
int m, czero;
zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
#ifdef TEST
#undef ZERO
#define ZERO ((PRECISION) 0)
#define ZERO ((ZOLO_PRECISION) 0)
#undef ONE
#define ONE ((PRECISION) 1)
#define ONE ((ZOLO_PRECISION) 1)
#undef TWO
#define TWO ((PRECISION) 2)
#define TWO ((ZOLO_PRECISION) 2)
/* Evaluate the rational approximation R(x) using the factored form */
static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R;
ZOLO_PRECISION R;
if (rdata -> type == 0) {
R = rdata -> A * x;
@@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
/* Evaluate the rational approximation R(x) using the partial fraction form */
static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> alpha[rdata -> da - 1];
ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
for (m = 0; m < rdata -> dd; m++)
R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
* non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
* but with signalling overflow you will get an error message. */
static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> beta[0] * x;
ZOLO_PRECISION R = rdata -> beta[0] * x;
for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
return R;
}
/* Evaluate the rational approximation R(x) using Cayley form */
static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION T;
ZOLO_PRECISION T;
T = rdata -> type == 0 ? ONE : -ONE;
for (m = 0; m < rdata -> n; m++)
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
int m, n, plotpts = 5000, type = 0;
float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
zolotarev_data *rdata;
PRECISION y;
ZOLO_PRECISION y;
FILE *plot_function, *plot_error,
*plot_partfrac, *plot_contfrac, *plot_cayley;
@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
}
rdata = type == 2
? higham((PRECISION) eps, n)
: zolotarev((PRECISION) eps, n, type);
? higham((ZOLO_PRECISION) eps, n)
: zolotarev((ZOLO_PRECISION) eps, n, type);
printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
"\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
"\tPRECISION = " STRINGIFY(PRECISION)
"\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
"\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
"\tDelta = %g (maximum error)\n\n"
"\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
x = 2.4 * (float) m / plotpts - 1.2;
if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
/* skip x = 0 for type 1, as R(0) is singular */
y = zolotarev_eval((PRECISION) x, rdata);
y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
fprintf(plot_function, "%g %g\n", x, (float) y);
fprintf(plot_error, "%g %g\n",
x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
/ rdata -> Delta);
if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
maxypferr = MAX(maxypferr, fabs(ypferr));

View File

@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION
#define PRECISION double
#ifndef ZOLO_PRECISION
#define ZOLO_PRECISION double
#endif
#define ZPRECISION PRECISION
#define ZPRECISION ZOLO_PRECISION
#define ZOLOTAREV_DATA zolotarev_data
#endif
@@ -77,8 +77,8 @@ typedef struct {
* zolotarev_data structure. The arguments must satisfy the constraints that
* epsilon > 0, n > 0, and type = 0 or 1. */
ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
void zolotarev_free(zolotarev_data *zdata);
#endif
@@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_BEGIN(Grid);
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,727 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#ifdef GRID_HIP
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
#include <cublas_v2.h>
#endif
#ifdef GRID_SYCL
#include <oneapi/mkl.hpp>
#endif
#if 0
#define GRID_ONE_MKL
#endif
#ifdef GRID_ONE_MKL
#include <oneapi/mkl.hpp>
#endif
///////////////////////////////////////////////////////////////////////
// Need to rearrange lattice data to be in the right format for a
// batched multiply. Might as well make these static, dense packed
///////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP
typedef hipblasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_CUDA
typedef cublasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_SYCL
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#ifdef GRID_ONE_MKL
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t;
#endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
class GridBLAS {
public:
static gridblasHandle_t gridblasHandle;
static int gridblasInit;
static void Init(void)
{
if ( ! gridblasInit ) {
#ifdef GRID_CUDA
std::cout << "cublasCreate"<<std::endl;
cublasCreate(&gridblasHandle);
cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
#endif
#ifdef GRID_HIP
std::cout << "hipblasCreate"<<std::endl;
hipblasCreate(&gridblasHandle);
#endif
#ifdef GRID_SYCL
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
cl::sycl::cpu_selector selector;
cl::sycl::device selectedDevice { selector };
gridblasHandle =new sycl::queue (selectedDevice);
#endif
gridblasInit=1;
}
}
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
/////////////////////////////////////////////////////////////////////////////////////
// BLAS GEMM conventions:
/////////////////////////////////////////////////////////////////////////////////////
// - C = alpha A * B + beta C
// Dimensions:
// - C_m.n
// - A_m.k
// - B_k.n
// - Flops = 8 M N K
// - Bytes = 2*sizeof(word) * (MN+MK+KN)
// M=60, N=12
// Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD
/////////////////////////////////////////////////////////////////////////////////////
void synchronise(void)
{
#ifdef GRID_HIP
auto err = hipDeviceSynchronize();
assert(err==hipSuccess);
#endif
#ifdef GRID_CUDA
auto err = cudaDeviceSynchronize();
assert(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
accelerator_barrier();
#endif
#ifdef GRID_ONE_MKL
gridblasHandle->wait();
#endif
}
void gemmBatched(int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
ComplexF alpha,
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexD> alpha_p(1);
static deviceVector<ComplexD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
RealD t0=usecond();
// std::cout << "ZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex **)&Amk[0], lda,
(hipblasDoubleComplex **)&Bkn[0], ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl;
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex **)&Amk[0], lda,
(cuDoubleComplex **)&Bkn[0], ldb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
// synchronise();
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
// std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
// std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
// std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexF alpha,
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexF> alpha_p(1);
static deviceVector<ComplexF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex **)&Amk[0], lda,
(hipblasComplex **)&Bkn[0], ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
ComplexF alphaf(real(alpha),imag(alpha));
ComplexF betaf(real(beta),imag(beta));
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexF c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Single precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealF> alpha_p(1);
static deviceVector<RealF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealD> alpha_p(1);
static deviceVector<RealD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasDgemmBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasDgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
/*
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t batchCount64=batchCount;
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
onemkl::transpose::N,
onemkl::transpose::N,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
1,&batchCount64);
*/
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Strided case used by benchmark, but generally unused in Grid
// Keep a code example in double complex, but don't generate the single and real variants for now
////////////////////////////////////////////////////////////////////////////////////////////////
void gemmStridedBatched(int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // pointer list to matrices
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn,
int batchCount)
{
// Use C-row major storage, so transpose calls
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
int sda = m*k;
int sdb = k*n;
int sdc = m*n;
deviceVector<ComplexD> alpha_p(1);
deviceVector<ComplexD> beta_p(1);
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
// std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
// std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
// std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
#ifdef GRID_HIP
auto err = hipblasZgemmStridedBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda, sda,
(hipblasDoubleComplex *) Bkn, ldb, sdb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc, sdc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasZgemmStridedBatched(gridblasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda, sda,
(cuDoubleComplex *) Bkn, ldb, sdb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc, sdc,
batchCount);
#endif
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
oneapi::mkl::transpose::N,
oneapi::mkl::transpose::N,
m,n,k,
alpha,
(const ComplexD *)Amk,lda,sda,
(const ComplexD *)Bkn,ldb,sdb,
beta,
(ComplexD *)Cmn,ldc,sdc,
batchCount);
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
}
}
}
#endif
}
double benchmark(int M, int N, int K, int BATCH)
{
int32_t N_A = M*K*BATCH;
int32_t N_B = K*N*BATCH;
int32_t N_C = M*N*BATCH;
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
ComplexD alpha(1.0);
ComplexD beta (1.0);
RealD flops = 8.0*M*N*K*BATCH;
int ncall=10;
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemmStridedBatched(M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0], // m x n
BATCH);
}
synchronise();
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
flops = 8.0*M*N*K*BATCH*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
};
NAMESPACE_END(Grid);

View File

@@ -176,6 +176,7 @@ template<class T> using cshiftAllocator = std::allocator<T>;
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid);

View File

@@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
return offbytes;
}
#undef NVLINK_GET // Define to use get instead of put DMA
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
@@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
#endif
}
if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
@@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
{
int nreq=list.size();
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);

View File

@@ -40,6 +40,9 @@ int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
void * GlobalSharedMemory::HostCommBuf;
#endif
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
@@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
void *SharedMemory::HostBufferMalloc(size_t bytes){
void *ptr = (void *)host_heap_top;
host_heap_top += bytes;
host_heap_bytes+= bytes;
if (host_heap_bytes >= host_heap_size) {
std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(host_heap_bytes<host_heap_size);
}
return ptr;
}
void SharedMemory::HostBufferFreeAll(void) {
host_heap_top =(size_t)HostCommBuf;
host_heap_bytes=0;
}
#endif
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;

View File

@@ -75,7 +75,9 @@ public:
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
#ifndef ACCELERATOR_AWARE_MPI
static void *HostCommBuf;
#endif
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
@@ -120,6 +122,13 @@ private:
size_t heap_bytes;
size_t heap_size;
#ifndef ACCELERATOR_AWARE_MPI
size_t host_heap_top; // set in free all
size_t host_heap_bytes;// set in free all
void *HostCommBuf; // set in SetCommunicator
size_t host_heap_size; // set in SetCommunicator
#endif
protected:
Grid_MPI_Comm ShmComm; // for barriers
@@ -151,7 +160,10 @@ public:
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
#ifndef ACCELERATOR_AWARE_MPI
void *HostBufferMalloc(size_t bytes);
void HostBufferFreeAll(void);
#endif
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////

View File

@@ -39,9 +39,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCL
#ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#endif
#include <syscall.h>
#define SHM_SOCKETS
#endif
#include <sys/socket.h>
@@ -512,46 +514,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
//if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = ShmCommBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
@@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
HostCommBuf= malloc(bytes);
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#else
#ifdef GRID_MPI3_SHMMMAP
@@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
ShmBufferFreeAll();
#ifndef ACCELERATOR_AWARE_MPI
host_heap_size = heap_size;
HostCommBuf= GlobalSharedMemory::HostCommBuf;
HostBufferFreeAll();
#endif
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////

View File

@@ -29,8 +29,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern Vector<std::pair<int,int> > Cshift_table;
extern std::vector<std::pair<int,int> > Cshift_table;
extern commVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void)
{
// GPU version
#ifdef ACCELERATOR_CSHIFT
uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz);
}
acceleratorCopyToDevice((void *)&Cshift_table[0],
(void *)&Cshift_table_device[0],
sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0];
#else
return &Cshift_table[0];
#endif
// CPU version use identify map
}
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
@@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
}
{
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
@@ -225,7 +244,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
{
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
@@ -297,30 +316,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
}
}
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
template <typename T>
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= e1*e2) return;
int n, b, o;
n = idx / e2;
b = idx % e2;
o = n*stride + b;
vector[2*idx + 0] = lo + o;
vector[2*idx + 1] = ro + o;
}
#endif
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
@@ -345,20 +340,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int ent=0;
if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@@ -372,7 +359,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}
{
auto table = &Cshift_table[0];
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
@@ -409,19 +396,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int ent=0;
if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@@ -432,7 +411,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}
{
auto table = &Cshift_table[0];
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);

View File

@@ -52,7 +52,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
RealD t1,t0;
t0=usecond();
if ( !comm_dim ) {
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
@@ -63,6 +64,8 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
t1=usecond();
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
@@ -127,16 +130,20 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
@@ -144,26 +151,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
tcomms-=usecond();
// grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
grid->Barrier();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
tscatter+=usecond();
}
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -190,6 +210,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
@@ -227,7 +253,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
tgather-=usecond();
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
@@ -252,7 +280,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
tcomms-=usecond();
// grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
@@ -262,7 +291,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
recv_from_rank,
bytes);
grid->Barrier();
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
@@ -270,9 +301,17 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -292,6 +331,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
@@ -315,7 +359,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
@@ -324,7 +370,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
@@ -332,7 +380,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
@@ -340,13 +389,24 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -372,6 +432,11 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
@@ -414,8 +479,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
@@ -440,7 +507,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
@@ -449,17 +517,28 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
grid->Barrier();
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
tscatter+=usecond();
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
*/
}
#endif
NAMESPACE_END(Grid);

View File

@@ -1,4 +1,5 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
Vector<std::pair<int,int> > Cshift_table;
std::vector<std::pair<int,int> > Cshift_table;
commVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid);

View File

@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
@@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/PaddedCell.h>

View File

@@ -270,5 +270,42 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L
return axpby_norm_fast(ret,a,b,x,y);
}
/// Trace product
template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
-> Lattice<decltype(trace(obj()))>
{
typedef decltype(trace(obj())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( rhs2 , rhs_2, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
-> Lattice<decltype(trace(obj1()))>
{
typedef decltype(trace(obj1())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1)
-> Lattice<decltype(trace(obj1()))>
{
return traceProduct(rhs_1,rhs_2);
}
NAMESPACE_END(Grid);
#endif

View File

@@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
#if ( (!defined(GRID_CUDA)) )
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region

View File

@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
}
}
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);

View File

@@ -31,6 +31,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
#include <Grid/lattice/Lattice_slicesum_core.h>
NAMESPACE_BEGIN(Grid);
@@ -280,11 +281,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
return nrm;
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
#ifdef GRID_SYCL
uint64_t csum=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// Hack
// Fast integer xor checksum. Can also be used in comms now.
autoView(l_v,left,AcceleratorRead);
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words);
}
FlightRecorder::CsumLog(csum);
#endif
ComplexD nrm = rankInnerProduct(left,right);
RealD local = real(nrm);
FlightRecorder::NormLog(real(nrm));
grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@@ -448,19 +467,10 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
int ostride=grid->_ostride[orthogdim];
//Reduce Data down to lvSum
sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
@@ -504,6 +514,7 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
return result;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{

View File

@@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
hipGetDevice(&device);
auto r=hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize;

View File

@@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
return result;
}
NAMESPACE_END(Grid);
/*
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
Double sumResult; zeroit(sumResult);
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
Double identity; zeroit(identity);
Word xorResult; xorResult = 0;
Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
Word identity; identity=0;
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum +=vec[index];
sum ^=vec[index];
});
});
theGridAccelerator->wait();
Double ret = d_sum[0];
Word ret = d_sum[0];
free(d_sum,*theGridAccelerator);
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
return ret;
}
NAMESPACE_END(Grid);
/*
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{

View File

@@ -152,6 +152,7 @@ public:
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
#if 0
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
@@ -162,9 +163,9 @@ public:
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
@@ -179,6 +180,9 @@ public:
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
#else
eng.discardhi(site);
#endif
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
@@ -407,7 +411,7 @@ public:
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@@ -424,7 +428,6 @@ public:
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
#if 1
thread_for( lidx, _grid->lSites(), {
int gidx;
@@ -445,29 +448,12 @@ public:
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
});
#else
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int o_idx;
int i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
if ( britney ) {
Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
} else {
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
#endif
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient

View File

@@ -0,0 +1,224 @@
#pragma once
#if defined(GRID_CUDA)
#include <cub/cub.cuh>
#define gpucub cub
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#elif defined(GRID_HIP)
#include <hipcub/hipcub.hpp>
#define gpucub hipcub
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#endif
NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
size_t subvol_size = e1*e2;
commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
vobj zero_init;
zeroit(zero_init);
void *temp_storage_array = NULL;
size_t temp_storage_bytes = 0;
vobj *d_out;
int* d_offsets;
std::vector<int> offsets(rd+1,0);
for (int i = 0; i < offsets.size(); i++) {
offsets[i] = i*subvol_size;
}
//Allocate memory for output and offset arrays on device
d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
//allocate memory for temp_storage_array
temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
//prepare buffer for reduction
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
//use 2d accelerator_for to avoid launch latencies found when serially looping over rd
accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
//issue segmented reductions in computeStream
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();
acceleratorFreeDevice(temp_storage_array);
acceleratorFreeDevice(d_out);
acceleratorFreeDevice(d_offsets);
}
#endif
#if defined(GRID_SYCL)
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
size_t subvol_size = e1*e2;
vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator);
vobj vobj_zero;
zeroit(vobj_zero);
for (int r = 0; r<rd; r++) {
mysum[r] = vobj_zero;
}
commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
// autoView(Data_v, Data, AcceleratorRead);
//prepare reduction buffer
accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
Reduction,
[=](cl::sycl::id<1> item, auto &sum) {
auto s = item[0];
sum += rb_p[r*subvol_size+s];
});
});
}
theGridAccelerator->wait();
for (int r = 0; r < rd; r++) {
lvSum[r] = mysum[r];
}
free(mysum,*theGridAccelerator);
}
#endif
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2;
commVector<vector>buffer(osites);
vector *dat = (vector *)Data;
vector *buf = &buffer[0];
Vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
#elif defined(GRID_SYCL)
sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
#endif
for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r];
}
}
}
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
{
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#elif defined (GRID_SYCL)
sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
else {
sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
}
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*ostride; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
}
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
NAMESPACE_END(Grid);

View File

@@ -469,15 +469,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
vobj cd = zz;
vobj cd = Zero();
for(int sb=0;sb<blockVol;sb++){

View File

@@ -45,6 +45,7 @@ public:
};
// Host only
GridBase * getGrid(void) const { return _grid; };
vobj* getHostPointer(void) const { return _odata; };
};
/////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -179,11 +179,11 @@ extern GridLogger GridLogSolver;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogDebug;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogIterative;
extern GridLogger GridLogIntegrator;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern GridLogger GridLogTracing;
@@ -191,6 +191,41 @@ extern Colours GridLogColours;
std::string demangle(const char* name) ;
template<typename... Args>
inline std::string sjoin(Args&&... args) noexcept {
std::ostringstream msg;
(msg << ... << args);
return msg.str();
}
/*! @brief make log messages work like python print */
template <typename... Args>
inline void Grid_log(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << GridLogMessage << msg << std::endl;
}
/*! @brief make warning messages work like python print */
template <typename... Args>
inline void Grid_warn(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl;
}
/*! @brief make error messages work like python print */
template <typename... Args>
inline void Grid_error(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl;
}
/*! @brief make pass messages work like python print */
template <typename... Args>
inline void Grid_pass(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
}
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];

View File

@@ -34,7 +34,7 @@ class GridTracer {
};
inline void tracePush(const char *name) { roctxRangePushA(name); }
inline void tracePop(const char *name) { roctxRangePop(); }
inline int traceStart(const char *name) { roctxRangeStart(name); }
inline int traceStart(const char *name) { return roctxRangeStart(name); }
inline void traceStop(int ID) { roctxRangeStop(ID); }
#endif

View File

@@ -129,6 +129,22 @@ public:
virtual ~Action(){}
};
template <class GaugeField >
class EmptyAction : public Action <GaugeField>
{
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() { return std::string("Level Force Log"); };
virtual std::string LogParameters() { return std::string("No parameters");};
};
NAMESPACE_END(Grid);
#endif // ACTION_BASE_H

View File

@@ -63,7 +63,9 @@ public:
virtual void MooeeDag(const FermionField &in, FermionField &out) ;
virtual void MooeeInv(const FermionField &in, FermionField &out) ;
virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
virtual void M(const FermionField &in, FermionField &out) ;
virtual void Mdag(const FermionField &in, FermionField &out) ;
private:
RealD mu; // TwistedMass parameter

View File

@@ -280,20 +280,16 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
#ifndef GRID_CUDA
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");
}
@@ -322,19 +318,13 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
}
}

View File

@@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#ifndef GRID_CUDA
@@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v ,st,AcceleratorRead);
if( interior && exterior ) {
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
#ifndef GRID_CUDA

View File

@@ -93,5 +93,25 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
RealD b = tm /sq;
axpibg5x(out,in,a,b);
}
template<class Impl>
void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
this->Dhop(in, out, DaggerNo);
FermionField tmp(out.Grid());
RealD a = 4.0+this->mass;
RealD b = this->mu;
axpibg5x(tmp,in,a,b);
axpy(out, 1.0, tmp, out);
}
template<class Impl>
void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
this->Dhop(in, out, DaggerYes);
FermionField tmp(out.Grid());
RealD a = 4.0+this->mass;
RealD b = -this->mu;
axpibg5x(tmp,in,a,b);
axpy(out, 1.0, tmp, out);
}
NAMESPACE_END(Grid);

View File

@@ -87,6 +87,8 @@ public:
const ActionSet<Field, RepresentationPolicy> as;
ActionSet<Field,RepresentationPolicy> LevelForces;
//Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){
static MomentumFilterNone<MomentaField> filter;
@@ -124,6 +126,9 @@ public:
// input U actually not used in the fundamental case
// Fundamental updates, include smearing
assert(as.size()==LevelForces.size());
Field level_force(U.Grid()); level_force =Zero();
for (int a = 0; a < as[level].actions.size(); ++a) {
double start_full = usecond();
@@ -144,7 +149,10 @@ public:
MomFilter->applyFilter(force);
std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<< std::endl;
// track the total
level_force = level_force+force;
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
@@ -167,6 +175,16 @@ public:
}
{
// total force
Real force_abs = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
Real force_max = std::sqrt(maxLocalNorm2(level_force));
Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;
LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
}
// Force from the other representations
as[level].apply(update_P_hireps, Representations, Mom, U, ep);
@@ -216,6 +234,16 @@ public:
//Default the momentum filter to "do-nothing"
MomFilter = getDefaultMomFilter();
for (int level = 0; level < as.size(); ++level) {
int multiplier = as.at(level).multiplier;
ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
Level->push_back(new EmptyAction<Field>);
LevelForces.push_back(*Level);
// does it copy by value or reference??
// - answer it copies by value, BUT the action level contains a reference that is NOT updated.
// Unsafe code in Guido's area
}
};
virtual ~Integrator() {}
@@ -233,10 +261,14 @@ public:
void reset_timer(void)
{
assert(as.size()==LevelForces.size());
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
as[level].actions.at(actionID)->reset_timer();
}
int actionID=0;
assert(LevelForces.at(level).actions.size()==1);
LevelForces.at(level).actions.at(actionID)->reset_timer();
}
}
void print_timer(void)
@@ -298,6 +330,16 @@ public:
<<" calls " << as[level].actions.at(actionID)->deriv_num
<< std::endl;
}
int actionID=0;
std::cout << GridLogMessage
<< LevelForces[level].actions.at(actionID)->action_name()
<<"["<<level<<"]["<< actionID<<"] :\n\t\t "
<<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average()
<<" norm " << LevelForces[level].actions.at(actionID)->deriv_norm_average()
<<" Fdt max " << LevelForces[level].actions.at(actionID)->Fdt_max_average()
<<" Fdt norm " << LevelForces[level].actions.at(actionID)->Fdt_norm_average()
<<" calls " << LevelForces[level].actions.at(actionID)->deriv_num
<< std::endl;
}
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
}
@@ -319,6 +361,13 @@ public:
std::cout << as[level].actions.at(actionID)->LogParameters();
}
}
std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl;
for (int level = 0; level < LevelForces.size(); ++level) {
std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl;
for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) {
std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl;
}
}
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
}
@@ -400,6 +449,7 @@ public:
RealD S(Field& U)
{ // here also U not used
assert(as.size()==LevelForces.size());
std::cout << GridLogIntegrator << "Integrator action\n";
RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom

View File

@@ -1,3 +1,4 @@
/*!
@file GaugeConfiguration.h
@brief Declares the GaugeConfiguration class
@@ -6,6 +7,15 @@
NAMESPACE_BEGIN(Grid);
template<class T> void Dump(const Lattice<T> & lat,
std::string s,
Coordinate site = Coordinate({0,0,0,0}))
{
typename T::scalar_object tmp;
peekSite(tmp,lat,site);
std::cout << " Dump "<<s<<" "<<tmp<<std::endl;
}
/*!
@brief Smeared configuration masked container
Modified for a multi-subset smearing (aka Luscher Flowed HMC)
@@ -28,6 +38,101 @@ private:
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
typedef typename SU3Adjoint::LatticeAdjVector AdjVectorField;
void BaseSmearDerivative(GaugeField& SigmaTerm,
const GaugeField& iLambda,
const GaugeField& U,
int mmu, RealD rho)
{
// Reference
// Morningstar, Peardon, Phys.Rev.D69,054501(2004)
// Equation 75
// Computing Sigma_mu, derivative of S[fat links] with respect to the thin links
// Output SigmaTerm
GridBase *grid = U.Grid();
WilsonLoops<Gimpl> WL;
GaugeLinkField staple(grid), u_tmp(grid);
GaugeLinkField iLambda_mu(grid), iLambda_nu(grid);
GaugeLinkField U_mu(grid), U_nu(grid);
GaugeLinkField sh_field(grid), temp_Sigma(grid);
Real rho_munu, rho_numu;
rho_munu = rho;
rho_numu = rho;
for(int mu = 0; mu < Nd; ++mu){
U_mu = peekLorentz( U, mu);
iLambda_mu = peekLorentz(iLambda, mu);
for(int nu = 0; nu < Nd; ++nu){
if(nu==mu) continue;
U_nu = peekLorentz( U, nu);
// Nd(nd-1) = 12 staples normally.
// We must compute 6 of these
// in FTHMC case
if ( (mu==mmu)||(nu==mmu) )
WL.StapleUpper(staple, U, mu, nu);
if(nu==mmu) {
iLambda_nu = peekLorentz(iLambda, nu);
temp_Sigma = -rho_numu*staple*iLambda_nu; //ok
//-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x)
Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity?
temp_Sigma = rho_numu*sh_field*staple; //ok
//r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)
Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
}
if ( mu == mmu ) {
sh_field = Cshift(iLambda_mu, nu, 1);
temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok
//-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x)
Gimpl::AddLink(SigmaTerm, temp_Sigma, mu);
}
// staple = Zero();
sh_field = Cshift(U_nu, mu, 1);
temp_Sigma = Zero();
if ( mu == mmu )
temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu;
if ( nu == mmu ) {
temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu;
u_tmp = adj(U_nu)*iLambda_nu;
sh_field = Cshift(u_tmp, mu, 1);
temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu;
}
sh_field = Cshift(temp_Sigma, nu, -1);
Gimpl::AddLink(SigmaTerm, sh_field, mu);
}
}
}
void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) {
GridBase *grid = U.Grid();
GaugeLinkField tmp_stpl(grid);
WilsonLoops<Gimpl> WL;
Cup = Zero();
for(int nu=0; nu<Nd; ++nu){
if (nu != mu) {
// get the staple in direction mu, nu
WL.Staple(tmp_stpl, U, mu, nu); //nb staple conventions of IroIro and Grid differ by a dagger
Cup += adj(tmp_stpl*rho);
}
}
}
// Adjoint vector to GaugeField force
void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
{
@@ -47,27 +152,54 @@ private:
GaugeLinkField UtaU(PlaqL.Grid());
GaugeLinkField D(PlaqL.Grid());
AdjMatrixField Dbc(PlaqL.Grid());
AdjMatrixField Dbc_opt(PlaqL.Grid());
LatticeComplex tmp(PlaqL.Grid());
const int Ngen = SU3Adjoint::Dimension;
Complex ci(0,1);
ColourMatrix ta,tb,tc;
RealD t=0;
RealD tp=0;
RealD tta=0;
RealD tpk=0;
t-=usecond();
for(int a=0;a<Ngen;a++) {
tta-=usecond();
SU3::generator(a, ta);
ta = 2.0 * ci * ta;
// Qlat Tb = 2i Tb^Grid
UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
UtaU= adj(PlaqL)*ta*PlaqR; // 6ms
tta+=usecond();
////////////////////////////////////////////
// Could add this entire C-loop to a projection routine
// for performance. Could also pick checkerboard on UtaU
// and set checkerboard on result for 2x perf
////////////////////////////////////////////
for(int c=0;c<Ngen;c++) {
SU3::generator(c, tc);
D = Ta( (2.0)*ci*tc *UtaU);
tc = 2.0*ci*tc;
tp-=usecond();
D = Ta( tc *UtaU); // 2ms
#if 1
SU3::LieAlgebraProject(Dbc_opt,D,c); // 5.5ms
#else
for(int b=0;b<Ngen;b++){
SU3::generator(b, tb);
tmp =-trace(ci*tb*D);
PokeIndex<ColourIndex>(Dbc,tmp,b,c); // Adjoint rep
}
#endif
tp+=usecond();
}
tmp = trace(MpInvJx * Dbc);
// Dump(Dbc_opt,"Dbc_opt");
// Dump(Dbc,"Dbc");
tpk-=usecond();
tmp = trace(MpInvJx * Dbc_opt);
PokeIndex<ColourIndex>(Fdet2,tmp,a);
tpk+=usecond();
}
t+=usecond();
std::cout << GridLogPerformance << " Compute_MpInvJx_dNxxdSy " << t/1e3 << " ms proj "<<tp/1e3<< " ms"
<< " ta "<<tta/1e3<<" ms" << " poke "<<tpk/1e3<< " ms"<<std::endl;
}
void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
@@ -79,12 +211,17 @@ private:
ColourMatrix tc;
for(int b=0;b<Ngen;b++) {
SU3::generator(b, tb);
Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR );
tb = 2.0 * ci * tb;
Nx = Ta( adj(PlaqL)*tb * PlaqR );
#if 1
SU3::LieAlgebraProject(NxAd,Nx,b);
#else
for(int c=0;c<Ngen;c++) {
SU3::generator(c, tc);
auto tmp =closure( -trace(ci*tc*Nx));
PokeIndex<ColourIndex>(NxAd,tmp,c,b);
}
#endif
}
}
void ApplyMask(GaugeField &U,int smr)
@@ -164,8 +301,7 @@ public:
// Computes ALL the staples -- could compute one only and do it here
RealD time;
time=-usecond();
this->StoutSmearing->BaseSmear(C, U);
Cmu = peekLorentz(C, mu);
BaseSmear(Cmu, U,mu,rho);
//////////////////////////////////////////////////////////////////
// Assemble Luscher exp diff map J matrix
@@ -209,6 +345,36 @@ public:
// dJ(x)/dxe
//////////////////////////////////////
time=-usecond();
#if 1
std::vector<AdjMatrixField> dJdX; dJdX.resize(8,grid);
std::vector<AdjMatrix> TRb_s; TRb_s.resize(8);
AdjMatrixField tbXn(grid);
AdjMatrixField sumXtbX(grid);
AdjMatrixField t2(grid);
AdjMatrixField dt2(grid);
AdjMatrixField t3(grid);
AdjMatrixField dt3(grid);
AdjMatrixField aunit(grid);
for(int b=0;b<8;b++){
SU3Adjoint::generator(b, TRb_s[b]);
dJdX[b] = TRb_s[b];
}
aunit = ComplexD(1.0);
// Could put into an accelerator_for
X = (-1.0)*ZxAd;
t2 = X;
for (int j = 12; j > 1; --j) {
t3 = t2*(1.0 / (j + 1)) + aunit;
t2 = X * t3;
for(int b=0;b<8;b++){
dJdX[b]= TRb_s[b] * t3 + X * dJdX[b]*(1.0 / (j + 1));
}
}
for(int b=0;b<8;b++){
dJdX[b] = -dJdX[b];
}
#else
std::vector<AdjMatrixField> dJdX; dJdX.resize(8,grid);
AdjMatrixField tbXn(grid);
AdjMatrixField sumXtbX(grid);
@@ -224,14 +390,15 @@ public:
X = (-1.0)*ZxAd;
t2 = X;
dt2 = TRb;
for (int j = 20; j > 1; --j) {
t3 = t2*(1.0 / (j + 1)) + aunit;
for (int j = 12; j > 1; --j) {
t3 = t2*(1.0 / (j + 1)) + aunit;
dt3 = dt2*(1.0 / (j + 1));
t2 = X * t3;
dt2 = TRb * t3 + X * dt3;
}
dJdX[b] = -dt2;
}
#endif
time+=usecond();
std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
/////////////////////////////////////////////////////////////////
@@ -281,8 +448,8 @@ public:
for(int e =0 ; e<8 ; e++){
LatticeComplexD tr(grid);
ColourMatrix te;
SU3::generator(e, te);
// ColourMatrix te;
// SU3::generator(e, te);
tr = trace(dJdX[e] * nMpInv);
pokeColour(dJdXe_nMpInv,tr,e);
}
@@ -493,20 +660,25 @@ public:
//////////////////////////////////////////////////////////////////
// Assemble the N matrix
//////////////////////////////////////////////////////////////////
// Computes ALL the staples -- could compute one only here
this->StoutSmearing->BaseSmear(C, U);
Cmu = peekLorentz(C, mu);
double rho=this->StoutSmearing->SmearRho[1];
BaseSmear(Cmu, U,mu,rho);
Umu = peekLorentz(U, mu);
Complex ci(0,1);
for(int b=0;b<Ngen;b++) {
SU3::generator(b, Tb);
// Qlat Tb = 2i Tb^Grid
Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
// FIXME -- replace this with LieAlgebraProject
#if 0
SU3::LieAlgebraProject(Ncb,tmp,b);
#else
for(int c=0;c<Ngen;c++) {
SU3::generator(c, Tc);
auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
PokeIndex<ColourIndex>(Ncb,tmp,c,b);
}
#endif
}
//////////////////////////////////////////////////////////////////
@@ -693,15 +865,19 @@ private:
const GaugeField& GaugeK,int level)
{
GridBase* grid = GaugeK.Grid();
GaugeField C(grid), SigmaK(grid), iLambda(grid);
GaugeField SigmaK(grid), iLambda(grid);
GaugeField SigmaKPrimeA(grid);
GaugeField SigmaKPrimeB(grid);
GaugeLinkField iLambda_mu(grid);
GaugeLinkField iQ(grid), e_iQ(grid);
GaugeLinkField SigmaKPrime_mu(grid);
GaugeLinkField GaugeKmu(grid), Cmu(grid);
this->StoutSmearing->BaseSmear(C, GaugeK);
int mmu= (level/2) %Nd;
int cb= (level%2);
double rho=this->StoutSmearing->SmearRho[1];
// Can override this to do one direction only.
SigmaK = Zero();
iLambda = Zero();
@@ -712,18 +888,38 @@ private:
// Could get away with computing only one polarisation here
// int mu= (smr/2) %Nd;
// SigmaKprime_A has only one component
for (int mu = 0; mu < Nd; mu++)
#if 0
BaseSmear(Cmu, GaugeK,mu,rho);
GaugeKmu = peekLorentz(GaugeK, mu);
SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
iQ = Ta(Cmu * adj(GaugeKmu));
this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
pokeLorentz(iLambda, iLambda_mu, mu);
BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho); // derivative of SmearBase
#else
// GaugeField C(grid);
// this->StoutSmearing->BaseSmear(C, GaugeK);
// for (int mu = 0; mu < Nd; mu++)
int mu =mmu;
BaseSmear(Cmu, GaugeK,mu,rho);
{
Cmu = peekLorentz(C, mu);
// Cmu = peekLorentz(C, mu);
GaugeKmu = peekLorentz(GaugeK, mu);
SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
iQ = Ta(Cmu * adj(GaugeKmu));
this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
pokeLorentz(iLambda, iLambda_mu, mu);
std::cout << " mu "<<mu<<" SigmaKPrime_mu"<<norm2(SigmaKPrime_mu)<< " iLambda_mu " <<norm2(iLambda_mu)<<std::endl;
}
this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase
// GaugeField SigmaKcopy(grid);
// SigmaKcopy = SigmaK;
BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho); // derivative of SmearBase
// this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase
// SigmaKcopy = SigmaKcopy - SigmaK;
// std::cout << " BaseSmearDerivative fast path error" <<norm2(SigmaKcopy)<<std::endl;
#endif
////////////////////////////////////////////////////////////////////////////////////
// propagate the rest of the force as identity map, just add back
////////////////////////////////////////////////////////////////////////////////////

View File

@@ -0,0 +1,389 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/smearing/HISQSmearing.h
Copyright (C) 2023
Author: D. A. Clarke <clarke.davida@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/*
@file HISQSmearing.h
@brief Declares classes related to HISQ smearing
*/
#pragma once
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
NAMESPACE_BEGIN(Grid);
// TODO: find a way to fold this into the stencil header. need to access grid to get
// Nd, since you don't want to inherit from QCD.h
/*! @brief append arbitrary shift path to shifts */
template<typename... Args>
void appendShift(std::vector<Coordinate>& shifts, int dir, Args... args) {
Coordinate shift(Nd,0);
generalShift(shift, dir, args...);
// push_back creates an element at the end of shifts and
// assigns the data in the argument to it.
shifts.push_back(shift);
}
/*! @brief figure out the stencil index from mu and nu */
accelerator_inline int stencilIndex(int mu, int nu) {
// Nshifts depends on how you built the stencil
int Nshifts = 6;
return Nshifts*nu + Nd*Nshifts*mu;
}
/*! @brief structure holding the link treatment */
struct SmearingParameters{
SmearingParameters(){}
Real c_1; // 1 link
Real c_naik; // Naik term
Real c_3; // 3 link
Real c_5; // 5 link
Real c_7; // 7 link
Real c_lp; // 5 link Lepage
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: c_1(c1),
c_naik(cnaik),
c_3(c3),
c_5(c5),
c_7(c7),
c_lp(clp){}
};
/*! @brief create fat links from link variables */
template<class Gimpl>
class Smear_HISQ : public Gimpl {
private:
GridCartesian* const _grid;
SmearingParameters _linkTreatment;
public:
INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Gimpl::GaugeField GF;
typedef typename Gimpl::GaugeLinkField LF;
typedef typename Gimpl::ComplexField CF;
// Don't allow default values here.
Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: _grid(grid),
_linkTreatment(c1,cnaik,c3,c5,c7,clp) {
assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
}
// Allow to pass a pointer to a C-style, double array for MILC convenience
Smear_HISQ(GridCartesian* grid, double* coeff)
: _grid(grid),
_linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) {
assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
}
~Smear_HISQ() {}
// Intent: OUT--u_smr, u_naik
// IN--u_thin
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
SmearingParameters lt = this->_linkTreatment;
auto grid = this->_grid;
// Create a padded cell of extra padding depth=1 and fill the padding.
int depth = 1;
PaddedCell Ghost(depth,grid);
GF Ughost = Ghost.Exchange(u_thin);
// This is where auxiliary N-link fields and the final smear will be stored.
GF Ughost_fat(Ughost.Grid());
GF Ughost_3link(Ughost.Grid());
GF Ughost_5linkA(Ughost.Grid());
GF Ughost_5linkB(Ughost.Grid());
// mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier,
// but these entries will not be used.
std::vector<Coordinate> shifts;
for(int mu=0;mu<Nd;mu++)
for(int nu=0;nu<Nd;nu++) {
appendShift(shifts,mu);
appendShift(shifts,nu);
appendShift(shifts,shiftSignal::NO_SHIFT);
appendShift(shifts,mu,Back(nu));
appendShift(shifts,Back(nu));
appendShift(shifts,Back(mu));
}
// A GeneralLocalStencil has two indices: a site and stencil index
GeneralLocalStencil gStencil(Ughost.Grid(),shifts);
// This is where contributions from the smearing get added together
Ughost_fat=Zero();
// This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik.
for(int mu=0;mu<Nd;mu++) {
// TODO: This approach is slightly memory inefficient. It uses 25% extra memory
Ughost_3link =Zero();
Ughost_5linkA=Zero();
Ughost_5linkB=Zero();
// Create the accessors
autoView(U_v , Ughost , AcceleratorRead);
autoView(U_fat_v , Ughost_fat , AcceleratorWrite);
autoView(U_3link_v , Ughost_3link , AcceleratorWrite);
autoView(U_5linkA_v, Ughost_5linkA, AcceleratorWrite);
autoView(U_5linkB_v, Ughost_5linkB, AcceleratorWrite);
// We infer some types that will be needed in the calculation.
typedef decltype(gStencil.GetEntry(0,0)) stencilElement;
typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix;
int Nsites = U_v.size();
auto gStencil_v = gStencil.View();
accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs
stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
U3matrix U0, U1, U2, U3, U4, U5, W;
for(int nu=0;nu<Nd;nu++) {
if(nu==mu) continue;
int s = stencilIndex(mu,nu);
// The stencil gives us support points in the mu-nu plane that we will use to
// grab the links we need.
SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset;
SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset;
SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset;
SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset;
SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu = SE5->_offset;
// When you're deciding whether to take an adjoint, the question is: how is the
// stored link oriented compared to the one you want? If I imagine myself travelling
// with the to-be-updated link, I have two possible, alternative 3-link paths I can
// take, one starting by going to the left, the other starting by going to the right.
U0 = coalescedReadGeneralPermute(U_v[x_p_mu ](nu),SE0->_permute,Nd);
U1 = coalescedReadGeneralPermute(U_v[x_p_nu ](mu),SE1->_permute,Nd);
U2 = coalescedReadGeneralPermute(U_v[x ](nu),SE2->_permute,Nd);
U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
U4 = coalescedReadGeneralPermute(U_v[x_m_nu ](mu),SE4->_permute,Nd);
U5 = coalescedReadGeneralPermute(U_v[x_m_nu ](nu),SE4->_permute,Nd);
// "left" "right"
W = U2*U1*adj(U0) + adj(U5)*U4*U3;
// Save 3-link construct for later and add to smeared field.
coalescedWrite(U_3link_v[x](nu), W);
// The index operator (x) returns the coalesced read on GPU. The view [] index returns
// a reference to the vector object. The [x](mu) returns a reference to the densely
// packed (contiguous in memory) mu-th element of the vector object. On CPU,
// coalescedRead/Write is the identity mapping assigning vector object to vector object.
// But on GPU it's non-trivial and maps scalar object to vector object and vice versa.
coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W);
}
})
accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link
stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
U3matrix U0, U1, U2, U3, U4, U5, W;
int sigmaIndex = 0;
for(int nu=0;nu<Nd;nu++) {
if(nu==mu) continue;
int s = stencilIndex(mu,nu);
for(int rho=0;rho<Nd;rho++) {
if (rho == mu || rho == nu) continue;
SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset;
SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset;
SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset;
SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset;
U0 = coalescedReadGeneralPermute( U_v[x_p_mu ](nu ),SE0->_permute,Nd);
U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu ](rho),SE1->_permute,Nd);
U2 = coalescedReadGeneralPermute( U_v[x ](nu ),SE2->_permute,Nd);
U3 = coalescedReadGeneralPermute( U_v[x_p_mu_m_nu](nu ),SE3->_permute,Nd);
U4 = coalescedReadGeneralPermute(U_3link_v[x_m_nu ](rho),SE4->_permute,Nd);
U5 = coalescedReadGeneralPermute( U_v[x_m_nu ](nu ),SE4->_permute,Nd);
W = U2*U1*adj(U0) + adj(U5)*U4*U3;
if(sigmaIndex<3) {
coalescedWrite(U_5linkA_v[x](rho), W);
} else {
coalescedWrite(U_5linkB_v[x](rho), W);
}
coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_5*W);
sigmaIndex++;
}
}
})
accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link
stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
U3matrix U0, U1, U2, U3, U4, U5, W;
int sigmaIndex = 0;
for(int nu=0;nu<Nd;nu++) {
if(nu==mu) continue;
int s = stencilIndex(mu,nu);
for(int rho=0;rho<Nd;rho++) {
if (rho == mu || rho == nu) continue;
SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset;
SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset;
SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset;
SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset;
U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd);
if(sigmaIndex<3) {
U1 = coalescedReadGeneralPermute(U_5linkB_v[x_p_nu](rho),SE1->_permute,Nd);
} else {
U1 = coalescedReadGeneralPermute(U_5linkA_v[x_p_nu](rho),SE1->_permute,Nd);
}
U2 = coalescedReadGeneralPermute(U_v[x](nu),SE2->_permute,Nd);
U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
if(sigmaIndex<3) {
U4 = coalescedReadGeneralPermute(U_5linkB_v[x_m_nu](rho),SE4->_permute,Nd);
} else {
U4 = coalescedReadGeneralPermute(U_5linkA_v[x_m_nu](rho),SE4->_permute,Nd);
}
U5 = coalescedReadGeneralPermute(U_v[x_m_nu](nu),SE4->_permute,Nd);
W = U2*U1*adj(U0) + adj(U5)*U4*U3;
coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_7*W);
sigmaIndex++;
}
}
})
} // end mu loop
// c1, c3, c5, c7 construct contributions
u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin;
// Load up U and V std::vectors to access thin and smeared links.
std::vector<LF> U(Nd, grid);
std::vector<LF> V(Nd, grid);
std::vector<LF> Vnaik(Nd, grid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(u_thin, mu);
V[mu] = PeekIndex<LorentzIndex>(u_smr, mu);
}
for(int mu=0;mu<Nd;mu++) {
// Naik
Vnaik[mu] = lt.c_naik*Gimpl::CovShiftForward(U[mu],mu,
Gimpl::CovShiftForward(U[mu],mu,
Gimpl::CovShiftIdentityForward(U[mu],mu)));
// LePage
for (int nu_h=1;nu_h<Nd;nu_h++) {
int nu=(mu+nu_h)%Nd;
// nu, nu, mu, Back(nu), Back(nu)
V[mu] = V[mu] + lt.c_lp*Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftForward(U[mu],mu,
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftIdentityBackward(U[nu],nu)))))
// Back(nu), Back(nu), mu, nu, nu
+ lt.c_lp*Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftForward(U[mu],mu,
Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftIdentityForward(U[nu],nu)))));
}
}
// Put V back into u_smr.
for (int mu = 0; mu < Nd; mu++) {
PokeIndex<LorentzIndex>(u_smr , V[mu] , mu);
PokeIndex<LorentzIndex>(u_naik, Vnaik[mu], mu);
}
};
// Intent: OUT--u_proj
// IN--u_mu
void projectU3(GF& u_proj, GF& u_mu) const {
auto grid = this->_grid;
LF V(grid), Q(grid), sqrtQinv(grid), id_3(grid), diff(grid);
CF c0(grid), c1(grid), c2(grid), g0(grid), g1(grid), g2(grid), S(grid), R(grid), theta(grid),
u(grid), v(grid), w(grid), den(grid), f0(grid), f1(grid), f2(grid);
// Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8)
for (int mu = 0; mu < Nd; mu++) {
V = PeekIndex<LorentzIndex>(u_mu, mu);
Q = adj(V)*V;
c0 = real(trace(Q));
c1 = (1/2.)*real(trace(Q*Q));
c2 = (1/3.)*real(trace(Q*Q*Q));
S = (1/3.)*c1-(1/18.)*c0*c0;
if (norm2(S)<1e-28) {
g0 = (1/3.)*c0; g1 = g0; g2 = g1;
} else {
R = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0;
theta = acos(R*pow(S,-1.5));
g0 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.);
g1 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta );
g2 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.);
}
// if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD }
u = sqrt(g0) + sqrt(g1) + sqrt(g2);
v = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2);
w = sqrt(g0*g1*g2);
den = w*(u*v-w);
f0 = (-w*(u*u+v)+u*v*v)/den;
f1 = (-w-u*u*u+2.*u*v)/den;
f2 = u/den;
id_3 = 1.;
sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q;
PokeIndex<LorentzIndex>(u_proj, V*sqrtQinv, mu);
}
};
// void derivative(const GaugeField& Gauge) const {
// };
};
NAMESPACE_END(Grid);

View File

@@ -5,4 +5,5 @@
#include <Grid/qcd/smearing/StoutSmearing.h>
#include <Grid/qcd/smearing/GaugeConfiguration.h>
#include <Grid/qcd/smearing/WilsonFlow.h>
#include <Grid/qcd/smearing/HISQSmearing.h>

View File

@@ -69,7 +69,7 @@ public:
/*! Construct stout smearing object from explicitly specified rho matrix */
Smear_Stout(const std::vector<double>& rho_)
: OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} {
std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl
std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl;
assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3");
}

View File

@@ -100,6 +100,9 @@ class GaugeGroup {
using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
template <typename vtype>
using iAlgebraVector = iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
template <typename vtype>
using iSUnAlgebraMatrix =
iScalar<iScalar<iMatrix<vtype, AdjointDimension> > >;
static int su2subgroups(void) { return su2subgroups(group_name()); }
//////////////////////////////////////////////////////////////////////////////////////////////////
@@ -128,10 +131,19 @@ class GaugeGroup {
typedef Lattice<vMatrix> LatticeMatrix;
typedef Lattice<vMatrixF> LatticeMatrixF;
typedef Lattice<vMatrixD> LatticeMatrixD;
typedef Lattice<vAlgebraVector> LatticeAlgebraVector;
typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF;
typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD;
typedef iSUnAlgebraMatrix<vComplex> vAlgebraMatrix;
typedef iSUnAlgebraMatrix<vComplexF> vAlgebraMatrixF;
typedef iSUnAlgebraMatrix<vComplexD> vAlgebraMatrixD;
typedef Lattice<vAlgebraMatrix> LatticeAlgebraMatrix;
typedef Lattice<vAlgebraMatrixF> LatticeAlgebraMatrixF;
typedef Lattice<vAlgebraMatrixD> LatticeAlgebraMatrixD;
typedef iSU2Matrix<Complex> SU2Matrix;
typedef iSU2Matrix<ComplexF> SU2MatrixF;
@@ -160,7 +172,7 @@ class GaugeGroup {
return generator(lieIndex, ta, group_name());
}
static void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) {
return su2SubGroupIndex(i1, i2, su2_index, group_name());
}
@@ -389,6 +401,52 @@ class GaugeGroup {
}
}
// Ta are hermitian (?)
// Anti herm is i Ta basis
static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b)
{
conformable(in, out);
GridBase *grid = out.Grid();
LatticeComplex tmp(grid);
Matrix ta;
// Using Luchang's projection convention
// 2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a
autoView(out_v,out,AcceleratorWrite);
autoView(in_v,in,AcceleratorRead);
int N = ncolour;
int NNm1 = N * (N - 1);
int hNNm1= NNm1/2;
RealD sqrt_2 = sqrt(2.0);
Complex ci(0.0,1.0);
for(int su2Index=0;su2Index<hNNm1;su2Index++){
int i1, i2;
su2SubGroupIndex(i1, i2, su2Index);
int ax = su2Index*2;
int ay = su2Index*2+1;
accelerator_for(ss,grid->oSites(),1,{
// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
// trace( Ta x Ci in)
// Bet I need to move to real part with mult by -i
out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
});
}
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
int k = diagIndex + 1; // diagIndex starts from 0
int a = NNm1+diagIndex;
RealD scale = 1.0/sqrt(2.0*k*(k+1));
accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
auto tmp = in_v[ss]()()(0,0);
for(int i=1;i<k;i++){
tmp=tmp+in_v[ss]()()(i,i);
}
tmp = tmp - in_v[ss]()()(k,k)*k;
out_v[ss]()()(a,b) =imag(tmp) * scale;
});
}
}
};
template <int ncolour>

View File

@@ -10,6 +10,7 @@
// doesn't get found by the scripts/filelist during bootstrapping.
private:
template <ONLY_IF_SU>
static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; }
////////////////////////////////////////////////////////////////////////
@@ -576,3 +577,4 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie
LieRandomize(pRNG,g,1.0);
GaugeTransform<Gimpl>(Umu,g);
}

View File

@@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
NAMESPACE_END(Grid);
#ifdef GRID_SYCL
template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vRealF > : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vRealD > : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
#endif
#endif

View File

@@ -218,6 +218,10 @@ public:
// -------------------------------------------------
// misc
// -------------------------------------------------
void discardhi(uint64_t z) {
_s[3] += z;
encrypt_counter();
}
// req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9
// Advances es state ei to ei+z by any means equivalent to z
@@ -387,4 +391,4 @@ private:
#undef MIXK
#undef MIX2
#endif
#endif

View File

@@ -137,5 +137,55 @@ public:
};
////////////////////////////////////////////////
// Some machinery to streamline making a stencil
////////////////////////////////////////////////
class shiftSignal {
public:
enum {
BACKWARD_CONST = 16,
NO_SHIFT = -1
};
};
// TODO: put a check somewhere that BACKWARD_CONST > Nd!
/*! @brief signals that you want to go backwards in direction dir */
inline int Back(const int dir) {
// generalShift will use BACKWARD_CONST to determine whether we step forward or
// backward. Trick inspired by SIMULATeQCD.
return dir + shiftSignal::BACKWARD_CONST;
}
/*! @brief shift one unit in direction dir */
template<typename... Args>
void generalShift(Coordinate& shift, int dir) {
if (dir >= shiftSignal::BACKWARD_CONST) {
dir -= shiftSignal::BACKWARD_CONST;
shift[dir]+=-1;
} else if (dir == shiftSignal::NO_SHIFT) {
; // do nothing
} else {
shift[dir]+=1;
}
}
/*! @brief follow a path of directions, shifting one unit in each direction */
template<typename... Args>
void generalShift(Coordinate& shift, int dir, Args... args) {
if (dir >= shiftSignal::BACKWARD_CONST) {
dir -= shiftSignal::BACKWARD_CONST;
shift[dir]+=-1;
} else if (dir == shiftSignal::NO_SHIFT) {
; // do nothing
} else {
shift[dir]+=1;
}
generalShift(shift, args...);
}
NAMESPACE_END(Grid);

View File

@@ -70,57 +70,6 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table);
/*
template<class vobj,class cobj,class compressor>
void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline));
template<class vobj,class cobj,class compressor>
void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
{
int num=table.size();
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
accelerator_forNB( i,num, vobj::Nsimd(), {
compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
});
rhs_v.ViewClose();
}
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split with compression
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
commVector<cobj *> pointers,
int dimension,int plane,
int cbmask,compressor &compress,int type) __attribute__((noinline));
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
compressor &compress,int type)
{
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto rhs_p = &rhs_v[0];
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
accelerator_forNB(j, num, vobj::Nsimd(), {
compress.CompressExchange(p0,p1, rhs_p, j,
so+tp[2*j ].second,
so+tp[2*j+1].second,
type);
});
rhs_v.ViewClose();
}
*/
void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
@@ -258,6 +207,10 @@ public:
struct Packet {
void * send_buf;
void * recv_buf;
#ifndef ACCELERATOR_AWARE_MPI
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
#endif
Integer to_rank;
Integer from_rank;
Integer do_send;
@@ -324,7 +277,7 @@ public:
Vector<int> surface_list;
stencilVector<StencilEntry> _entries; // Resident in managed memory
commVector<StencilEntry> _entries_device; // Resident in managed memory
commVector<StencilEntry> _entries_device; // Resident in device memory
std::vector<Packet> Packets;
std::vector<Merge> Mergers;
std::vector<Merge> MergersSHM;
@@ -408,33 +361,16 @@ public:
// Use OpenMP Tasks for cleaner ???
// must be called *inside* parallel region
//////////////////////////////////////////
/*
void CommunicateThreaded()
{
#ifdef GRID_OMP
int mythread = omp_get_thread_num();
int nthreads = CartesianCommunicator::nCommThreads;
#else
int mythread = 0;
int nthreads = 1;
#endif
if (nthreads == -1) nthreads = 1;
if (mythread < nthreads) {
for (int i = mythread; i < Packets.size(); i += nthreads) {
uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes,i);
}
}
}
*/
////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel.
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
#ifdef ACCELERATOR_AWARE_MPI
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
@@ -443,16 +379,54 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
#else
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
if ( Packets[i].do_send ) {
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
}
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].host_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].host_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
#endif
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_send )
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
}
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull();
acceleratorCopySynchronise();
// acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
// accelerator_barrier();
_grid->StencilBarrier();
#ifndef ACCELERATOR_AWARE_MPI
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) {
acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
}
}
_grid->HostBufferFreeAll();
#endif
// run any checksums
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
}
}
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
@@ -528,6 +502,7 @@ public:
template<class compressor>
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
// accelerator_barrier();
_grid->StencilBarrier();// Synch shared memory on a single nodes
assert(source.Grid()==_grid);
@@ -540,10 +515,9 @@ public:
compress.Point(point);
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier();
accelerator_barrier(); // All my local gathers are complete
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
}
/////////////////////////
@@ -579,6 +553,7 @@ public:
accelerator_forNB(j, words, cobj::Nsimd(), {
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
acceleratorFenceComputeStream();
}
}
@@ -669,6 +644,7 @@ public:
for(int i=0;i<dd.size();i++){
decompressor::DecompressFace(decompress,dd[i]);
}
acceleratorFenceComputeStream(); // dependent kernels
}
////////////////////////////////////////
// Set up routines
@@ -706,7 +682,7 @@ public:
}
}
}
std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
//std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@@ -761,7 +737,8 @@ public:
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances,
Parameters p=Parameters())
Parameters p=Parameters(),
bool preserve_shm=false)
{
face_table_computed=0;
_grid = grid;
@@ -855,7 +832,9 @@ public:
/////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd();
_grid->ShmBufferFreeAll();
// Allow for multiple stencils to exist simultaneously
if (!preserve_shm)
_grid->ShmBufferFreeAll();
int maxl=2;
u_simd_send_buf.resize(maxl);
@@ -1221,7 +1200,6 @@ public:
///////////////////////////////////////////////////////////
int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off],
xmit_to_rank, do_send,

View File

@@ -69,6 +69,35 @@ accelerator_inline auto trace(const iVector<vtype,N> &arg) -> iVector<decltype(t
}
return ret;
}
////////////////////////////
// Fast path traceProduct
////////////////////////////
template<class S1 , class S2, IfNotGridTensor<S1> = 0, IfNotGridTensor<S2> = 0>
accelerator_inline auto traceProduct( const S1 &arg1,const S2 &arg2)
-> decltype(arg1*arg2)
{
return arg1*arg2;
}
template<class vtype,class rtype,int N >
accelerator_inline auto traceProduct(const iMatrix<vtype,N> &arg1,const iMatrix<rtype,N> &arg2) -> iScalar<decltype(trace(arg1._internal[0][0]*arg2._internal[0][0]))>
{
iScalar<decltype( trace(arg1._internal[0][0]*arg2._internal[0][0] )) > ret;
zeroit(ret._internal);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
ret._internal=ret._internal+traceProduct(arg1._internal[i][j],arg2._internal[j][i]);
}}
return ret;
}
template<class vtype,class rtype >
accelerator_inline auto traceProduct(const iScalar<vtype> &arg1,const iScalar<rtype> &arg2) -> iScalar<decltype(trace(arg1._internal*arg2._internal))>
{
iScalar<decltype(trace(arg1._internal*arg2._internal))> ret;
ret._internal=traceProduct(arg1._internal,arg2._internal);
return ret;
}
NAMESPACE_END(Grid);

View File

@@ -34,9 +34,12 @@ NAMESPACE_BEGIN(Grid);
// These are the Grid tensors
template<typename T> struct isGridTensor : public std::false_type { static constexpr bool notvalue = true; };
template<class T> struct isGridTensor<iScalar<T>> : public std::true_type { static constexpr bool notvalue = false; };
template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type { static constexpr bool notvalue = false; };
template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type { static constexpr bool notvalue = false; };
template<class T> struct isGridTensor<iScalar<T> > : public std::true_type { static constexpr bool notvalue = false; };
template<class T, int N> struct isGridTensor<iVector<T, N> >: public std::true_type { static constexpr bool notvalue = false; };
template<class T, int N> struct isGridTensor<iMatrix<T, N> >: public std::true_type { static constexpr bool notvalue = false; };
template <typename T> using IfGridTensor = Invoke<std::enable_if<isGridTensor<T>::value, int> >;
template <typename T> using IfNotGridTensor = Invoke<std::enable_if<!isGridTensor<T>::value, int> >;
// Traits to identify scalars
template<typename T> struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; };
@@ -401,3 +404,5 @@ NAMESPACE_BEGIN(Grid);
};
NAMESPACE_END(Grid);

View File

@@ -7,6 +7,8 @@ uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;};
void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#define ENV_LOCAL_RANK_PALS "PALS_LOCAL_RANKID"
#define ENV_RANK_PALS "PALS_RANKID"
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
@@ -147,7 +149,7 @@ void acceleratorInit(void)
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
hipGetDeviceProperties(&gpu_props[i], i);
auto r=hipGetDeviceProperties(&gpu_props[i], i);
hipDeviceProp_t prop;
prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem;
@@ -208,8 +210,8 @@ void acceleratorInit(void)
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
// theCopyAccelerator = new sycl::queue (selectedDevice);
theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
theCopyAccelerator = new sycl::queue (selectedDevice);
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
zeInit(0);
@@ -228,8 +230,17 @@ void acceleratorInit(void)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_PALS )) != NULL) { world_rank = atoi(localRankStr);}
char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = cl::sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){
@@ -241,9 +252,10 @@ void acceleratorInit(void)
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
if ( world_rank == 0) {
GPU_PROP_STR(vendor);
GPU_PROP_STR(version);
GPU_PROP_STR(vendor);
GPU_PROP_STR(version);
// GPU_PROP_STR(device_type);
/*
GPU_PROP(max_compute_units);
@@ -259,7 +271,8 @@ void acceleratorInit(void)
GPU_PROP(single_fp_config);
*/
// GPU_PROP(double_fp_config);
GPU_PROP(global_mem_size);
GPU_PROP(global_mem_size);
}
}
if ( world_rank == 0 ) {

View File

@@ -225,6 +225,8 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
@@ -253,17 +255,13 @@ inline int acceleratorIsCommunicable(void *ptr)
#define GRID_SYCL_LEVEL_ZERO_IPC
NAMESPACE_END(Grid);
#if 0
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <CL/sycl/backend/level_zero.hpp>
#else
// Force deterministic reductions
#define SYCL_REDUCTION_DETERMINISTIC
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <sycl/ext/oneapi/backend/level_zero.hpp>
#endif
NAMESPACE_BEGIN(Grid);
@@ -287,23 +285,24 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
unsigned long nt=acceleratorThreads(); \
unsigned long unum1 = num1; \
unsigned long unum2 = num2; \
if(nt < 8)nt=8; \
cl::sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
auto lane = item.get_global_id(2); \
{ __VA_ARGS__ }; \
}); \
});
unsigned long nt=acceleratorThreads(); \
if(nt < 8)nt=8; \
unsigned long unum1 = num1; \
unsigned long unum2 = num2; \
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
cl::sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
auto lane = item.get_global_id(2); \
{ if (iter1 < unum1){ __VA_ARGS__ } }; \
}); \
});
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
@@ -405,7 +404,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
#define accelerator_barrier(dummy) \
{ \
hipStreamSynchronize(computeStream); \
auto r=hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
@@ -438,19 +437,21 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);}
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
//inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
hipMemcpyDtoDAsync(to,from,bytes, copyStream);
auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
}
inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); };
inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); };
#endif
@@ -575,4 +576,11 @@ accelerator_inline void acceleratorFence(void)
return;
}
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
{
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
acceleratorCopySynchronise();
}
NAMESPACE_END(Grid);

336
Grid/util/FlightRecorder.cc Normal file
View File

@@ -0,0 +1,336 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Init.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@MacBook-Pro.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////
// Grid Norm logging for repro testing
///////////////////////////////////////////////////////
int FlightRecorder::PrintEntireLog;
int FlightRecorder::ContinueOnFail;
int FlightRecorder::LoggingMode;
int FlightRecorder::ChecksumComms;
int FlightRecorder::ChecksumCommsSend;
int32_t FlightRecorder::XmitLoggingCounter;
int32_t FlightRecorder::RecvLoggingCounter;
int32_t FlightRecorder::CsumLoggingCounter;
int32_t FlightRecorder::NormLoggingCounter;
int32_t FlightRecorder::ReductionLoggingCounter;
uint64_t FlightRecorder::ErrorCounter;
std::vector<double> FlightRecorder::NormLogVector;
std::vector<double> FlightRecorder::ReductionLogVector;
std::vector<uint64_t> FlightRecorder::CsumLogVector;
std::vector<uint64_t> FlightRecorder::XmitLogVector;
std::vector<uint64_t> FlightRecorder::RecvLogVector;
void FlightRecorder::ResetCounters(void)
{
XmitLoggingCounter=0;
RecvLoggingCounter=0;
CsumLoggingCounter=0;
NormLoggingCounter=0;
ReductionLoggingCounter=0;
}
void FlightRecorder::Truncate(void)
{
ResetCounters();
XmitLogVector.resize(0);
RecvLogVector.resize(0);
NormLogVector.resize(0);
CsumLogVector.resize(0);
ReductionLogVector.resize(0);
}
void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
{
switch ( mode ) {
case LoggingModePrint:
SetLoggingModePrint();
break;
case LoggingModeRecord:
SetLoggingModeRecord();
break;
case LoggingModeVerify:
SetLoggingModeVerify();
break;
case LoggingModeNone:
LoggingMode = mode;
Truncate();
break;
default:
assert(0);
}
}
void FlightRecorder::SetLoggingModePrint(void)
{
std::cout << " FlightRecorder: set to print output " <<std::endl;
Truncate();
LoggingMode = LoggingModePrint;
}
void FlightRecorder::SetLoggingModeRecord(void)
{
std::cout << " FlightRecorder: set to RECORD " <<std::endl;
Truncate();
LoggingMode = LoggingModeRecord;
}
void FlightRecorder::SetLoggingModeVerify(void)
{
std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl;
ResetCounters();
LoggingMode = LoggingModeVerify;
}
uint64_t FlightRecorder::ErrorCount(void)
{
return ErrorCounter;
}
void FlightRecorder::NormLog(double value)
{
uint64_t hex = * ( (uint64_t *)&value );
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLogVector.push_back(value);
NormLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(NormLoggingCounter < NormLogVector.size()){
uint64_t hexref = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] );
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl;
fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
NormLoggingCounter,NormLogVector.size(),
value, NormLogVector[NormLoggingCounter]); fflush(stderr);
if(!ContinueOnFail)assert(0); // Force takedown of job
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
<<" "<<hex<<" "<<hexref
<<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl;
}
}
}
if ( NormLogVector.size()==NormLoggingCounter ) {
std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
}
NormLoggingCounter++;
}
}
void FlightRecorder::CsumLog(uint64_t hex)
{
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLogVector.push_back(hex);
CsumLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(CsumLoggingCounter < CsumLogVector.size()) {
uint64_t hexref = CsumLogVector[CsumLoggingCounter] ;
if ( hex != hexref ) {
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
CsumLoggingCounter,hex, hexref);
fflush(stderr);
if(!ContinueOnFail) assert(0); // Force takedown of job
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
}
}
}
if ( CsumLogVector.size()==CsumLoggingCounter ) {
std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
}
CsumLoggingCounter++;
}
}
void FlightRecorder::ReductionLog(double local,double global)
{
uint64_t hex_l = * ( (uint64_t *)&local );
uint64_t hex_g = * ( (uint64_t *)&global );
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
ReductionLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
ReductionLogVector.push_back(global);
ReductionLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(ReductionLoggingCounter < ReductionLogVector.size()){
if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
ReductionLoggingCounter,ReductionLogVector.size(),
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
if ( !ContinueOnFail ) assert(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
}
}
}
if ( ReductionLogVector.size()==ReductionLoggingCounter ) {
std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
}
ReductionLoggingCounter++;
}
}
void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
{
if(LoggingMode == LoggingModeNone) return;
if ( ChecksumCommsSend ){
uint64_t *ubuf = (uint64_t *)buf;
if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
XmitLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
XmitLogVector.push_back(_xor);
XmitLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(XmitLoggingCounter < XmitLogVector.size()){
if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
XmitLoggingCounter,XmitLogVector.size(),
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
if ( !ContinueOnFail ) assert(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
}
}
}
if ( XmitLogVector.size()==XmitLoggingCounter ) {
std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
}
XmitLoggingCounter++;
}
#endif
}
}
void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
{
if ( ChecksumComms ){
uint64_t *ubuf = (uint64_t *)buf;
if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
RecvLoggingCounter++;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
RecvLogVector.push_back(_xor);
RecvLoggingCounter++;
}
if(LoggingMode == LoggingModeVerify) {
if(RecvLoggingCounter < RecvLogVector.size()){
if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
RecvLoggingCounter,RecvLogVector.size(),
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
if ( !ContinueOnFail ) assert(0);
ErrorCounter++;
} else {
if ( PrintEntireLog ) {
std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;
}
}
}
if ( RecvLogVector.size()==RecvLoggingCounter ) {
std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<RecvLoggingCounter<<" sends "<<std::endl;
}
RecvLoggingCounter++;
}
#endif
}
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,43 @@
#pragma once
NAMESPACE_BEGIN(Grid);
class FlightRecorder {
public:
enum LoggingMode_t {
LoggingModeNone,
LoggingModePrint,
LoggingModeRecord,
LoggingModeVerify
};
static int LoggingMode;
static uint64_t ErrorCounter;
static int32_t XmitLoggingCounter;
static int32_t RecvLoggingCounter;
static int32_t CsumLoggingCounter;
static int32_t NormLoggingCounter;
static int32_t ReductionLoggingCounter;
static std::vector<uint64_t> XmitLogVector;
static std::vector<uint64_t> RecvLogVector;
static std::vector<uint64_t> CsumLogVector;
static std::vector<double> NormLogVector;
static std::vector<double> ReductionLogVector;
static int ContinueOnFail;
static int PrintEntireLog;
static int ChecksumComms;
static int ChecksumCommsSend;
static void SetLoggingModePrint(void);
static void SetLoggingModeRecord(void);
static void SetLoggingModeVerify(void);
static void SetLoggingMode(LoggingMode_t mode);
static void NormLog(double value);
static void CsumLog(uint64_t csum);
static void ReductionLog(double lcl, double glbl);
static void Truncate(void);
static void ResetCounters(void);
static uint64_t ErrorCount(void);
static void xmitLog(void *,uint64_t bytes);
static void recvLog(void *,uint64_t bytes,int rank);
};
NAMESPACE_END(Grid);

View File

@@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts)
}
#endif
#ifndef HOST_NAME_MAX
#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
@@ -90,7 +94,12 @@ int GridThread::_threads =1;
int GridThread::_hyperthreads=1;
int GridThread::_cores=1;
char hostname[HOST_NAME_MAX+1];
char *GridHostname(void)
{
return hostname;
}
const Coordinate &GridDefaultLatt(void) {return Grid_default_latt;};
const Coordinate &GridDefaultMpi(void) {return Grid_default_mpi;};
const Coordinate GridDefaultSimd(int dims,int nsimd)
@@ -393,6 +402,8 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
gethostname(hostname, HOST_NAME_MAX+1);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
/////////////////////////////////////////////////////////
// Reporting

View File

@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
void Grid_init(int *argc,char ***argv);
void Grid_finalize(void);
char * GridHostname(void);
// internal, controled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);
@@ -68,5 +70,6 @@ void GridParseLayout(char **argv,int argc,
void printHash(void);
NAMESPACE_END(Grid);

View File

@@ -1,6 +1,6 @@
#ifndef GRID_UTIL_H
#define GRID_UTIL_H
#pragma once
#include <Grid/util/Coordinate.h>
#include <Grid/util/Lexicographic.h>
#include <Grid/util/Init.h>
#endif
#include <Grid/util/FlightRecorder.h>

View File

@@ -54,15 +54,16 @@ int main(int argc, char **argv)
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 12;
MD.MDsteps = 24;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 0;
HMCparams.StartTrajectory = 104;
HMCparams.Trajectories = 200;
HMCparams.NoMetropolisUntil= 20;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
HMCparams.StartingType =std::string("HotStart");
// HMCparams.StartingType =std::string("HotStart");
HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
@@ -87,6 +88,7 @@ int main(int argc, char **argv)
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 16;
@@ -134,7 +136,6 @@ int main(int argc, char **argv)
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(2);
ActionLevel<HMCWrapper::Field> Level3(4);
////////////////////////////////////
// Strange action
@@ -191,7 +192,7 @@ int main(int argc, char **argv)
Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
if( ApplySmearing ) Level2.push_back(&Jacobian);
if( ApplySmearing ) Level1.push_back(&Jacobian);
std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
@@ -200,7 +201,7 @@ int main(int argc, char **argv)
/////////////////////////////////////////////////////////////
// GaugeAction.is_smeared = ApplySmearing;
GaugeAction.is_smeared = true;
Level3.push_back(&GaugeAction);
Level2.push_back(&GaugeAction);
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
@@ -210,10 +211,11 @@ int main(int argc, char **argv)
std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
TheHMC.Run(SmearingPolicy); // for smearing

226
HMC/FTHMC2p1f_3GeV.cc Normal file
View File

@@ -0,0 +1,226 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Copyright (C) 2023
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
using namespace Grid;
int main(int argc, char **argv)
{
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// here make a routine to print all the relevant information on the run
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef MobiusFermionD FermionAction;
typedef typename FermionAction::FermionField FermionField;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 24;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 0;
HMCparams.Trajectories = 200;
HMCparams.NoMetropolisUntil= 20;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("HotStart");
HMCparams.StartingType =std::string("ColdStart");
// HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_EODWF_lat";
CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
CPparams.rng_prefix = "ckpoint_EODWF_rng";
CPparams.saveInterval = 1;
CPparams.saveSmeared = true;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
Real beta = 2.37;
Real light_mass = 0.0047;
Real strange_mass = 0.0186;
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD b = 1.0; // Scale factor one, Shamir
RealD c = 0.0;
OneFlavourRationalParams OFRp;
OFRp.lo = 1.0e-2;
OFRp.hi = 64;
OFRp.MaxIter = 10000;
OFRp.tolerance= 1.0e-10;
OFRp.degree = 14;
OFRp.precision= 40;
std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
LatticeGaugeField Uhot(GridPtr);
// These lines are unecessary if BC are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
double StoppingCondition = 1e-10;
double MaxCGIterations = 30000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
bool ApplySmearing = true;
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(2);
////////////////////////////////////
// Strange action
////////////////////////////////////
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
EOFA(Strange_Op_L, Strange_Op_R,
CG,
CG, CG,
CG, CG,
OFRp, false);
EOFA.is_smeared = ApplySmearing;
Level1.push_back(&EOFA);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]);
light_num.push_back(hasenbusch[h]);
}
light_num.push_back(pv_mass);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage << " 2f quotient Action "<< light_num[h] << " / " << light_den[h]<< std::endl;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
}
for(int h=0;h<n_hasenbusch+1;h++){
Quotients[h]->is_smeared = ApplySmearing;
Level1.push_back(Quotients[h]);
}
/////////////////////////////////////////////////////////////
// lnDetJacobianAction
/////////////////////////////////////////////////////////////
double rho = 0.1; // smearing parameter
int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd
int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd
Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
if( ApplySmearing ) Level1.push_back(&Jacobian);
std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
GaugeAction.is_smeared = ApplySmearing;
Level2.push_back(&GaugeAction);
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
} // main

226
HMC/HMC2p1f_3GeV.cc Normal file
View File

@@ -0,0 +1,226 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Copyright (C) 2023
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
using namespace Grid;
int main(int argc, char **argv)
{
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// here make a routine to print all the relevant information on the run
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef MobiusFermionD FermionAction;
typedef typename FermionAction::FermionField FermionField;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 24;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 0;
HMCparams.Trajectories = 200;
HMCparams.NoMetropolisUntil= 20;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("HotStart");
HMCparams.StartingType =std::string("ColdStart");
// HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_EODWF_lat";
CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
CPparams.rng_prefix = "ckpoint_EODWF_rng";
CPparams.saveInterval = 1;
CPparams.saveSmeared = true;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
Real beta = 2.37;
Real light_mass = 0.0047;
Real strange_mass = 0.0186;
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD b = 1.0; // Scale factor one, Shamir
RealD c = 0.0;
OneFlavourRationalParams OFRp;
OFRp.lo = 1.0e-2;
OFRp.hi = 64;
OFRp.MaxIter = 10000;
OFRp.tolerance= 1.0e-10;
OFRp.degree = 14;
OFRp.precision= 40;
std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 });
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
LatticeGaugeField Uhot(GridPtr);
// These lines are unecessary if BC are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
double StoppingCondition = 1e-10;
double MaxCGIterations = 30000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
bool ApplySmearing = false;
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(2);
////////////////////////////////////
// Strange action
////////////////////////////////////
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
EOFA(Strange_Op_L, Strange_Op_R,
CG,
CG, CG,
CG, CG,
OFRp, false);
EOFA.is_smeared = ApplySmearing;
Level1.push_back(&EOFA);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]);
light_num.push_back(hasenbusch[h]);
}
light_num.push_back(pv_mass);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage << " 2f quotient Action "<< light_num[h] << " / " << light_den[h]<< std::endl;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
}
for(int h=0;h<n_hasenbusch+1;h++){
Quotients[h]->is_smeared = ApplySmearing;
Level1.push_back(Quotients[h]);
}
/////////////////////////////////////////////////////////////
// lnDetJacobianAction
/////////////////////////////////////////////////////////////
double rho = 0.1; // smearing parameter
int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd
int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd
Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
if( ApplySmearing ) Level1.push_back(&Jacobian);
std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
GaugeAction.is_smeared = ApplySmearing;
Level2.push_back(&GaugeAction);
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
} // main

View File

@@ -0,0 +1,350 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_hmc_EODWFRatio.cc
Copyright (C) 2015-2016
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
Grid_init(&argc, &argv);
CartesianCommunicator::BarrierWorld();
std::cout << GridLogMessage << " Clock skew check" <<std::endl;
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
typedef WilsonImplD FermionImplPolicy;
typedef MobiusFermionD FermionAction;
typedef MobiusEOFAFermionD FermionEOFAAction;
typedef typename FermionAction::FermionField FermionField;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
typedef GenericHMCRunner<ForceGradient> HMCWrapper;
MD.name = std::string("Force Gradient");
//typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
// MD.name = std::string("MinimumNorm2");
// TrajL = 2
// 4/2 => 0.6 dH
// 3/3 => 0.8 dH .. depth 3, slower
//MD.MDsteps = 4;
MD.MDsteps = 3;
MD.trajL = 0.5;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 1077;
HMCparams.Trajectories = 1;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("ColdStart");
HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_DDHMC_lat";
CPparams.rng_prefix = "ckpoint_DDHMC_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
std::cout << "loaded NERSC checpointer"<<std::endl;
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
RealD M5 = 1.8;
RealD b = 1.5;
RealD c = 0.5;
Real beta = 2.13;
// Real light_mass = 5.4e-4;
Real light_mass = 7.8e-4;
Real light_mass_dir = 0.01;
Real strange_mass = 0.0362;
Real pv_mass = 1.0;
std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
int SP_iters=9000;
RationalActionParams OFRp; // Up/down
OFRp.lo = 6.0e-5;
OFRp.hi = 90.0;
OFRp.inv_pow = 2;
OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space
OFRp.action_tolerance= 1.0e-8;
OFRp.action_degree = 18;
OFRp.md_tolerance= 1.0e-7;
OFRp.md_degree = 14;
// OFRp.degree = 20; converges
// OFRp.degree = 16;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
std::vector<RealD> ActionTolByPole({
// 1.0e-8,1.0e-8,1.0e-8,1.0e-8,
3.0e-7,1.0e-7,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
std::vector<RealD> MDTolByPole({
// 1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
// 1.0e-6,3.0e-7,1.0e-7,1.0e-7,
1.0e-5,1.0e-6,1.0e-7,1.0e-7, // soften convergence
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
////////////////////////////////////////////////////////////////
// Domain decomposed
////////////////////////////////////////////////////////////////
Coordinate latt4 = GridPtr->GlobalDimensions();
Coordinate mpi = GridPtr->ProcessorGrid();
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Coordinate NonDirichlet(Nd+1,0);
Coordinate Dirichlet(Nd+1,0);
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
//Dirichlet[1] = 0;
//Dirichlet[2] = 0;
//Dirichlet[3] = 0;
//
Coordinate Block4(Nd);
Block4[0] = Dirichlet[1];
Block4[1] = Dirichlet[2];
Block4[2] = Dirichlet[3];
Block4[3] = Dirichlet[4];
int Width=4;
TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplD::Field>(Block4,Width));
//////////////////////////
// Fermion Grids
//////////////////////////
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeFieldD U(GridPtr); U=Zero();
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
std::cout << "loaded NERSC gauge field"<<std::endl;
// These lines are unecessary if BC are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
FermionAction::ImplParams ParamsDir(boundary);
Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=0;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
double StoppingCondition = 1e-8;
double MDStoppingCondition = 1e-8;
double MDStoppingConditionLoose = 1e-8;
double MDStoppingConditionStrange = 1e-8;
double MaxCGIterations = 300000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(3);
ActionLevel<HMCWrapper::Field> Level3(15);
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params);
// Probably dominates the force - back to EOFA.
OneFlavourRationalParams SFRp;
SFRp.lo = 0.1;
SFRp.hi = 25.0;
SFRp.MaxIter = 10000;
SFRp.tolerance= 1.0e-8;
SFRp.mdtolerance= 2.0e-6;
SFRp.degree = 12;
SFRp.precision= 50;
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
ConjugateGradient<FermionField> ActionCG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> DerivativeCG(MDStoppingCondition,MaxCGIterations);
LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
EOFA(Strange_Op_L, Strange_Op_R,
ActionCG,
ActionCG, ActionCG,
DerivativeCG, DerivativeCG,
SFRp, true);
Level2.push_back(&EOFA);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
std::vector<int> dirichlet_den;
std::vector<int> dirichlet_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass); dirichlet_den.push_back(0);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1);
}
for(int h=0;h<n_hasenbusch;h++){
light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1);
}
light_num.push_back(pv_mass); dirichlet_num.push_back(0);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
std::vector<LinearOperatorD *> LinOpD;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage
<< " 2f quotient Action ";
std::cout << "det D("<<light_den[h]<<")";
if ( dirichlet_den[h] ) std::cout << "^dirichlet ";
std::cout << "/ det D("<<light_num[h]<<")";
if ( dirichlet_num[h] ) std::cout << "^dirichlet ";
std::cout << std::endl;
FermionAction::ImplParams ParamsNum(boundary);
FermionAction::ImplParams ParamsDen(boundary);
if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet;
else ParamsNum.dirichlet = NonDirichlet;
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
double conv = MDStoppingCondition;
if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
if(h!=0) {
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG));
} else {
Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
}
}
for(int h=0;h<Bdys.size();h++){
Bdys[h]->SetTolerances(ActionTolByPole,MDTolByPole);
}
int nquo=Quotients.size();
Level1.push_back(Bdys[0]);
Level1.push_back(Bdys[1]);
Level2.push_back(Quotients[0]);
for(int h=1;h<nquo-1;h++){
Level2.push_back(Quotients[h]);
}
Level2.push_back(Quotients[nquo-1]);
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
/////////////////////////////////////////////////////////////
TheHMC.Run(); // no smearing
Grid_finalize();
} // main

View File

@@ -343,7 +343,7 @@ int main(int argc, char **argv) {
// Probably dominates the force - back to EOFA.
OneFlavourRationalParams SFRp;
SFRp.lo = 0.1;
SFRp.hi = 25.0;
SFRp.hi = 30.0;
SFRp.MaxIter = 10000;
SFRp.tolerance= 1.0e-5;
SFRp.mdtolerance= 2.0e-4;

View File

@@ -128,7 +128,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c
////////////////////////////////////////////////////////////////////////////////////
// Make a mixed precision conjugate gradient
////////////////////////////////////////////////////////////////////////////////////
#if 1
#if 0
RealD delta=1.e-4;
std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl;
ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD);
@@ -180,7 +180,7 @@ int main(int argc, char **argv) {
// 4/2 => 0.6 dH
// 3/3 => 0.8 dH .. depth 3, slower
//MD.MDsteps = 4;
MD.MDsteps = 14;
MD.MDsteps = 12;
MD.trajL = 0.5;
HMCparameters HMCparams;
@@ -204,7 +204,7 @@ int main(int argc, char **argv) {
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
std::cout << "loaded NERSC checpointer"<<std::endl;
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
@@ -218,15 +218,14 @@ int main(int argc, char **argv) {
RealD M5 = 1.8;
RealD b = 1.5;
RealD c = 0.5;
Real beta = 2.13;
RealD beta = 2.13;
// Real light_mass = 5.4e-4;
Real light_mass = 7.8e-4;
// Real light_mass = 7.8e-3;
Real strange_mass = 0.0362;
Real pv_mass = 1.0;
// std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
//std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
@@ -277,20 +276,20 @@ int main(int argc, char **argv) {
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
double StoppingCondition = 1e-9;
double MDStoppingCondition = 1e-8;
double MDStoppingConditionLoose = 1e-8;
double MDStoppingConditionStrange = 1e-8;
double MaxCGIterations = 300000;
double StoppingCondition = 1e-14;
double MDStoppingCondition = 1e-9;
double MDStoppingConditionLoose = 1e-9;
double MDStoppingConditionStrange = 1e-9;
double MaxCGIterations = 50000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
// ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(1);
ActionLevel<HMCWrapper::Field> Level3(15);
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(2);
ActionLevel<HMCWrapper::Field> Level3(4);
////////////////////////////////////
// Strange action
@@ -300,11 +299,11 @@ int main(int argc, char **argv) {
// Probably dominates the force - back to EOFA.
OneFlavourRationalParams SFRp;
SFRp.lo = 0.1;
SFRp.lo = 0.8;
SFRp.hi = 30.0;
SFRp.MaxIter = 10000;
SFRp.tolerance= 1.0e-8;
SFRp.mdtolerance= 2.0e-6;
SFRp.tolerance= 1.0e-12;
SFRp.mdtolerance= 1.0e-9;
SFRp.degree = 10;
SFRp.precision= 50;
@@ -355,8 +354,10 @@ int main(int argc, char **argv) {
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
EOFA(Strange_Op_L, Strange_Op_R,
ActionCG,
ActionCGL, ActionCGR,
DerivativeCGL, DerivativeCGR,
// ActionCGL, ActionCGR,
// DerivativeCGL, DerivativeCGR,
ActionCG, ActionCG,
DerivativeCG, DerivativeCG,
SFRp, true);
Level2.push_back(&EOFA);
@@ -443,13 +444,14 @@ int main(int argc, char **argv) {
}
int nquo=Quotients.size();
for(int h=0;h<nquo;h++){
Level2.push_back(Quotients[h]);
Level1.push_back(Quotients[h]);
}
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;

View File

@@ -0,0 +1,268 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_hmc_EODWFRatio.cc
Copyright (C) 2015-2016
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
std::cout << " Grid Initialise "<<std::endl;
Grid_init(&argc, &argv);
CartesianCommunicator::BarrierWorld();
std::cout << GridLogMessage << " Clock skew check" <<std::endl;
int threads = GridThread::GetThreads();
// Typedefs to simplify notation
typedef WilsonImplD FermionImplPolicy;
typedef MobiusFermionD FermionAction;
typedef MobiusEOFAFermionD FermionEOFAAction;
typedef typename FermionAction::FermionField FermionField;
typedef WilsonImplF FermionImplPolicyF;
typedef MobiusFermionF FermionActionF;
typedef MobiusEOFAFermionF FermionEOFAActionF;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
typedef GenericHMCRunner<ForceGradient> HMCWrapper;
MD.name = std::string("Force Gradient");
// typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
// MD.name = std::string("MinimumNorm2");
// TrajL = 2
// 4/2 => 0.6 dH
// 3/3 => 0.8 dH .. depth 3, slower
//MD.MDsteps = 4;
MD.MDsteps = 8;
MD.trajL = 0.5;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 1077;
HMCparams.Trajectories = 20;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
HMCparams.StartingType =std::string("ColdStart");
// HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_HMC_lat";
CPparams.rng_prefix = "ckpoint_HMC_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
std::cout << "loaded NERSC checpointer"<<std::endl;
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 12;
RealD M5 = 1.8;
RealD b = 1.5;
RealD c = 0.5;
RealD beta = 2.13;
// Real light_mass = 5.4e-4;
Real light_mass = 7.8e-4;
// Real light_mass = 7.8e-3;
Real strange_mass = 0.0362;
Real pv_mass = 1.0;
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
//std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
////////////////////////////////////////////////////////////////
// Domain decomposed
////////////////////////////////////////////////////////////////
Coordinate latt4 = GridPtr->GlobalDimensions();
Coordinate mpi = GridPtr->ProcessorGrid();
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////////
// Fermion Grids
//////////////////////////
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeFieldD U(GridPtr); U=Zero();
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
TheHMC.initializeGaugeFieldAndRNGs(U);
std::cout << "loaded NERSC gauge field"<<std::endl;
// These lines are unecessary if BC are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
double StoppingCondition = 1e-14;
double MDStoppingCondition = 1e-9;
double MDStoppingConditionLoose = 1e-9;
double MDStoppingConditionStrange = 1e-9;
double MaxCGIterations = 50000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(2);
ActionLevel<HMCWrapper::Field> Level3(4);
////////////////////////////////////
// Strange action
////////////////////////////////////
FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params);
FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params);
// Probably dominates the force - back to EOFA.
OneFlavourRationalParams SFRp;
SFRp.lo = 0.8;
SFRp.hi = 30.0;
SFRp.MaxIter = 10000;
SFRp.tolerance= 1.0e-12;
SFRp.mdtolerance= 1.0e-9;
SFRp.degree = 10;
SFRp.precision= 50;
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
ConjugateGradient<FermionField> ActionCG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> DerivativeCG(MDStoppingCondition,MaxCGIterations);
LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L);
LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
EOFA(Strange_Op_L, Strange_Op_R,
ActionCG,
ActionCG, ActionCG,
DerivativeCG, DerivativeCG,
SFRp, true);
Level2.push_back(&EOFA);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]);
}
for(int h=0;h<n_hasenbusch;h++){
light_num.push_back(hasenbusch[h]);
}
light_num.push_back(pv_mass);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
std::vector<LinearOperatorD *> LinOpD;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage
<< " 2f quotient Action ";
std::cout << "det D("<<light_den[h]<<")";
std::cout << "/ det D("<<light_num[h]<<")";
std::cout << std::endl;
FermionAction::ImplParams ParamsNum(boundary);
FermionAction::ImplParams ParamsDen(boundary);
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
double conv = MDStoppingCondition;
if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG,CG));
}
int nquo=Quotients.size();
for(int h=0;h<nquo;h++){
Level1.push_back(Quotients[h]);
}
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
/////////////////////////////////////////////////////////////
TheHMC.Run(); // no smearing
Grid_finalize();
} // main

22
MPI_benchmark/bench2.pbs Normal file
View File

@@ -0,0 +1,22 @@
#!/bin/bash
#PBS -q EarlyAppAccess
#PBS -l select=2
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES
cd $PBS_O_WORKDIR
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
NTOTRANKS=$(( NNODES * NRANKS ))
CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1"
$CMD

View File

@@ -0,0 +1 @@
mpicxx -fsycl halo_mpi.cc -o halo_mpi

View File

@@ -0,0 +1,30 @@
#!/bin/bash
export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
#unset EnableWalkerPartition
#export EnableImplicitScaling=0
#export GRID_MPICH_NIC_BIND=$NIC
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
numactl -m $PNUMA -N $NUMA "$@"

333
MPI_benchmark/halo_mpi.cc Normal file
View File

@@ -0,0 +1,333 @@
#include <cassert>
#include <complex>
#include <memory>
#include <vector>
#include <algorithm>
#include <array>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <ctime>
#include <sys/time.h>
#include <mpi.h>
/**************************************************************
* GPU - GPU memory cartesian halo exchange benchmark
* Config: what is the target
**************************************************************
*/
#undef ACC_CUDA
#undef ACC_HIP
#define ACC_SYCL
#undef ACC_NONE
/**************************************************************
* Some MPI globals
**************************************************************
*/
MPI_Comm WorldComm;
MPI_Comm WorldShmComm;
int WorldSize;
int WorldRank;
int WorldShmSize;
int WorldShmRank;
/**************************************************************
* Allocate buffers on the GPU, SYCL needs an init call and context
**************************************************************
*/
#ifdef ACC_CUDA
#include <cuda.h>
void acceleratorInit(void){}
void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMalloc((void **)&ptr,bytes);
assert(err==cudaSuccess);
return ptr;
}
void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}
#endif
#ifdef ACC_HIP
#include <hip/hip_runtime.h>
void acceleratorInit(void){}
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);};
#endif
#ifdef ACC_SYCL
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
cl::sycl::queue *theAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
#if 1
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theAccelerator = new sycl::queue (selectedDevice);
#else
cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v };
theAccelerator = new sycl::queue (selectedDevice);
#endif
auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
}
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);};
#endif
#ifdef ACC_NONE
void acceleratorInit(void){}
inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
/**************************************************************
* Microsecond timer
**************************************************************
*/
inline double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec;
}
/**************************************************************
* Main benchmark routine
**************************************************************
*/
void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall)
{
int64_t words = 3*4*2;
int64_t face,vol;
int Nd=cart_geom.size();
/**************************************************************
* L^Nd volume, L^(Nd-1) faces, 12 complex per site
* Allocate memory for these
**************************************************************
*/
face=1; for( int d=0;d<Nd-1;d++) face = face*L;
vol=1; for( int d=0;d<Nd;d++) vol = vol*L;
std::vector<void *> send_bufs;
std::vector<void *> recv_bufs;
size_t vw = face*words;
size_t bytes = face*words*sizeof(double);
if ( use_device ) {
for(int d=0;d<2*Nd;d++){
send_bufs.push_back(acceleratorAllocDevice(bytes));
recv_bufs.push_back(acceleratorAllocDevice(bytes));
}
} else {
for(int d=0;d<2*Nd;d++){
send_bufs.push_back(malloc(bytes));
recv_bufs.push_back(malloc(bytes));
}
}
/*********************************************************
* Build cartesian communicator
*********************************************************
*/
int ierr;
int rank;
std::vector<int> coor(Nd);
MPI_Comm communicator;
std::vector<int> periodic(Nd,1);
MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&rank);
MPI_Cart_coords(communicator,rank,Nd,&coor[0]);
static int reported;
if ( ! reported ) {
printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank,
coor[0],coor[1],coor[2],coor[3]); fflush(stdout);
reported =1 ;
}
/*********************************************************
* Perform halo exchanges
*********************************************************
*/
for(int d=0;d<Nd;d++){
if ( cart_geom[d]>1 ) {
double t0=usecond();
int from,to;
MPI_Barrier(communicator);
for(int n=0;n<ncall;n++){
void *xmit = (void *)send_bufs[d];
void *recv = (void *)recv_bufs[d];
ierr=MPI_Cart_shift(communicator,d,1,&from,&to);
assert(ierr==0);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
xmit = (void *)send_bufs[Nd+d];
recv = (void *)recv_bufs[Nd+d];
ierr=MPI_Cart_shift(communicator,d,-1,&from,&to);
assert(ierr==0);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
MPI_Barrier(communicator);
double t1=usecond();
double dbytes = bytes*WorldShmSize;
double xbytes = dbytes*2.0*ncall;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
if ( ! WorldRank ) {
printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout);
}
}
}
/*********************************************************
* Free memory
*********************************************************
*/
if ( use_device ) {
for(int d=0;d<2*Nd;d++){
acceleratorFreeDevice(send_bufs[d]);
acceleratorFreeDevice(recv_bufs[d]);
}
} else {
for(int d=0;d<2*Nd;d++){
free(send_bufs[d]);
free(recv_bufs[d]);
}
}
}
/**************************************
* Command line junk
**************************************/
std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
{
char ** itr = std::find(begin, end, option);
if (itr != end && ++itr != end) {
std::string payload(*itr);
return payload;
}
return std::string("");
}
bool CmdOptionExists(char** begin, char** end, const std::string& option)
{
return std::find(begin, end, option) != end;
}
void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
{
vec.resize(0);
std::stringstream ss(str);
int i;
while (ss >> i){
vec.push_back(i);
if(std::ispunct(ss.peek()))
ss.ignore();
}
return;
}
/**************************************
* Command line junk
**************************************/
int main(int argc, char **argv)
{
std::string arg;
acceleratorInit();
MPI_Init(&argc,&argv);
WorldComm = MPI_COMM_WORLD;
MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldComm ,&WorldRank);
MPI_Comm_size(WorldComm ,&WorldSize);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldSize/WorldShmSize > 2) {
printf("This benchmark is meant to run on at most two nodes only\n");
}
auto mpi =std::vector<int>({1,1,1,1});
if( CmdOptionExists(argv,argv+argc,"--mpi") ){
arg = CmdOptionPayload(argv,argv+argc,"--mpi");
CmdOptionIntVector(arg,mpi);
} else {
printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n");
exit(0);
}
if( !WorldRank ) {
printf("***********************************\n");
printf("%d ranks\n",WorldSize);
printf("%d ranks-per-node\n",WorldShmSize);
printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout);
printf("Cartesian layout: ");
for(int d=0;d<mpi.size();d++){
printf("%d ",mpi[d]);
}
printf("\n");fflush(stdout);
printf("***********************************\n");
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= Benchmarking HOST memory MPI performance \n");
printf("=========================================================\n");fflush(stdout);
printf("= L\t pkt bytes\t MB/s \n");
printf("=========================================================\n");fflush(stdout);
}
for(int L=16;L<=64;L+=4){
Benchmark(L,mpi,false,100);
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= Benchmarking DEVICE memory MPI performance \n");
printf("=========================================================\n");fflush(stdout);
}
for(int L=16;L<=64;L+=4){
Benchmark(L,mpi,true,100);
}
if( !WorldRank ) {
printf("=========================================================\n");
printf("= DONE \n");
printf("=========================================================\n");
}
MPI_Finalize();
}

View File

@@ -90,11 +90,11 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet);
@@ -105,11 +105,11 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet);
@@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
GaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
@@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet)
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
std::cout<<GridLogMessage << "RESULT" << std::endl;
// std::cout << result<<std::endl;
std::cout << norm2(result)<<std::endl;
std::cout<<GridLogMessage << "REF" << std::endl;
std::cout << norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "ERR" << std::endl;
std::cout << norm2(err)<<std::endl;
FGrid->Barrier();
exit(-1);
}
assert (n2e< 1.0e-4 );

View File

@@ -0,0 +1,968 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_usqcd.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
using namespace Grid;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
double mflop_ref;
double mflop_ref_err;
int NN_global;
FILE * FP;
struct time_statistics{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v){
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
void comms_header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<"bytes\t MB/s uni \t\t MB/s bidi "<<std::endl;
};
struct controls {
int Opt;
int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
};
class Benchmark {
public:
static void Decomposition (void ) {
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvReal : "<<sizeof(vReal )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplex : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
static void Comms(void)
{
int Nloop=200;
int nmu=0;
int maxlat=32;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
comms_header();
fprintf(FP,"Communications\n\n");
fprintf(FP,"Packet bytes, direction, GB/s per node\n");
for(int lat=16;lat<=maxlat;lat+=8){
// for(int Ls=8;Ls<=8;Ls*=2){
{ int Ls=12;
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
//Grid.ShmBufferFreeAll();
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
// bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
// int ncomm;
double dbytes;
for(int dir=0;dir<8;dir++) {
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
std::vector<double> times(Nloop);
for(int i=0;i<Nloop;i++){
dbytes=0;
double start=usecond();
int xmit_to_rank;
int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank,
bytes);
dbytes+=bytes;
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
<< bytes << " \t "
<<xbytes/timestat.mean
<< "\t\t"
<< bidibytes/timestat.mean<< std::endl;
fprintf(FP,"%ld, %d, %f\n",(long)bytes,dir,bidibytes/timestat.mean/1000.);
}
}
for(int d=0;d<8;d++){
acceleratorFreeDevice(xbuf[d]);
acceleratorFreeDevice(rbuf[d]);
}
}
}
fprintf(FP,"\n\n");
return;
}
static void Memory(void)
{
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
typedef iVector<vReal,Nvec> Vec;
Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
fprintf(FP,"Memory Bandwidth\n\n");
fprintf(FP,"Bytes, GB/s per node\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
// uint64_t NP;
uint64_t NN;
uint64_t lmax=40;
#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// NP= Grid.RankCount();
NN =Grid.NodeCount();
Vec rn ; random(sRNG,rn);
LatticeVec z(&Grid); z=Zero();
LatticeVec x(&Grid); x=Zero();
LatticeVec y(&Grid); y=Zero();
double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
<< "\t\t"<< bytes/time/NN <<std::endl;
fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN);
}
fprintf(FP,"\n\n");
};
static void BLAS(void)
{
//int nbasis, int nrhs, int coarseVol
int basis[] = { 16,32,64 };
int rhs[] = { 8,16,32 };
int vol = 4*4*4*4;
GridBLAS blas;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
for(int b=0;b<3;b++){
for(int r=0;r<3;r++){
int M=basis[b];
int N=rhs[r];
int K=basis[b];
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
std::cout<<GridLogMessage<<std::setprecision(3)
<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}}
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block project)"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int b=0;b<3;b++){
for(int r=0;r<3;r++){
int M=basis[b];
int N=rhs[r];
int K=vol;
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
std::cout<<GridLogMessage<<std::setprecision(3)
<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}}
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block promote)"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int b=0;b<3;b++){
for(int r=0;r<3;r++){
int M=rhs[r];
int N=vol;
int K=basis[b];
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
std::cout<<GridLogMessage<<std::setprecision(3)
<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}}
fprintf(FP,"\n\n\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
};
static void SU4(void)
{
const int Nc4=4;
typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t NN;
uint64_t lmax=32;
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
NN =Grid.NodeCount();
LatticeSU4 z(&Grid); z=Zero();
LatticeSU4 x(&Grid); x=Zero();
LatticeSU4 y(&Grid); y=Zero();
// double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=x*y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add
double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF);
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
<< "\t\t"<< bytes/time/NN <<std::endl;
}
};
static double DWF(int Ls,int L)
{
RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Nc : "<<Nc<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
typedef DomainWallFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
///////// Source preparation ////////////
Gauge Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
Fermion src (FGrid); random(RNG5,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#ifdef AVX512
const int num_cases = 3;
#else
const int num_cases = 2;
#endif
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
};
for(int c=0;c<num_cases;c++) {
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
// Nc=3 gives
// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
// double flops=(1344.0*volume)/2;
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2;
double flops=(fps*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
static double Staggered(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
typedef ImprovedStaggeredFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
typename Action::ImplParams params;
Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
///////// Source preparation ////////////
Fermion src (FGrid); random(RNG4,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
const int num_cases = 2;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
};
for(int c=0;c<num_cases;c++) {
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1146.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
return mflops_best;
}
static double Clover(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark Clover on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass=0.1;
RealD csw=1.0;
typedef WilsonCloverFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw);
///////// Source preparation ////////////
Fermion src (FGrid); random(RNG4,src);
Fermion r (FGrid);
{
const int num_cases = 1;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
};
for(int c=0;c<num_cases;c++) {
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Dc.M(src,r);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dc.M(src,r);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344+ 24+6*6*8*2)*volume;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
return mflops_best;
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
if (GlobalSharedMemory::WorldRank==0) {
FP = fopen("Benchmark_usqcd.csv","w");
} else {
FP = fopen("/dev/null","w");
}
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
Benchmark::Decomposition();
int do_su4=0;
int do_memory=1;
int do_comms =1;
int do_blas =1;
int sel=4;
std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1;
std::vector<double> clover;
std::vector<double> dwf4;
std::vector<double> staggered;
int Ls=1;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
clover.push_back(Benchmark::DWF(1,L_list[l]));
}
Ls=12;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::DWF(Ls,L_list[l]) ;
dwf4.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::Staggered(L_list[l]) ;
staggered.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int NN=NN_global;
if ( do_memory ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Memory();
}
if ( do_blas ) {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::BLAS();
#endif
}
if ( do_su4 ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::SU4();
}
if ( do_comms ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Comms();
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
fprintf(FP,"Per node summary table\n");
fprintf(FP,"\n");
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
fprintf(FP,"\n");
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
}
fprintf(FP,"\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Grid_finalize();
fclose(FP);
}

View File

@@ -1,12 +1,12 @@
#!/usr/bin/env bash
set -e
EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2'
EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626'
echo "-- deploying Eigen source..."
ARC=`basename ${EIGEN_URL}`
ARC=$(basename ${EIGEN_URL})
wget ${EIGEN_URL} --no-check-certificate
if command -v sha256sum; then
echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \
@@ -14,13 +14,8 @@ if command -v sha256sum; then
else
echo "WARNING: could not verify checksum, please install sha256sum" >&2
fi
./scripts/update_eigen.sh ${ARC}
rm ${ARC}
# patch for non-portable includes in Eigen 3.3.5
# apparently already fixed in Eigen HEAD so it should not be
# a problem in the future (A.P.)
patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
./scripts/update_eigen.sh "${ARC}"
rm "${ARC}"
echo '-- generating Make.inc files...'
./scripts/filelist
echo '-- generating configure script...'

View File

@@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in
esac
############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
AC_ARG_ENABLE([accelerator-cshift],
[AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
[ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
AC_ARG_ENABLE([accelerator-aware-mpi],
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
AC_ARG_ENABLE([ucx-buggy],
[AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
[ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
case ${ac_UCXBUGGY} in
case ${ac_ACCELERATOR_AWARE_MPI} in
yes)
ac_ACC_CSHIFT=no;;
*);;
esac
case ${ac_ACC_CSHIFT} in
yes)
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
*);;
esac

View File

@@ -0,0 +1,183 @@
/*
* Example_plaquette.cc
*
* D. Clarke
*
* Here I just want to create an incredibly simple main to get started with GRID and get used
* to its syntax. If the reader is like me, they vaguely understand something about lattice coding,
* they don't know a ton of C++, don't know much of the fine details, and certainly know nothing about GRID.
*
* Once you've made a new executable, like this one, you can bootstrap.sh again. At this point,
* the code should be able to find your new executable. You can tell that bootstrap.sh worked by
* having a look at Make.inc. You should see your executable inside there.
*
* Warning: This code illustrative only, not well tested, and not meant for production use. The best
* way to read this code is to start at the main.
*
*/
// All your mains should have this
#include <Grid/Grid.h>
using namespace Grid;
// This copies what already exists in WilsonLoops.h. The point here is to be pedagogical and explain in
// detail what everything does so we can see how GRID works.
template <class Gimpl> class WLoops : public Gimpl {
public:
// Gimpl seems to be an arbitrary class. Within this class, it is expected that certain types are
// already defined, things like Scalar and Field. This macro includes a bunch of #typedefs that
// implement this equivalence at compile time.
INHERIT_GIMPL_TYPES(Gimpl);
// Some example Gimpls can be found in GaugeImplementations.h, at the bottom. These are in turn built
// out of GaugeImplTypes, which can be found in GaugeImplTypes.h. The GaugeImplTypes contain the base
// field/vector/link/whatever types. These inherit from iScalar, iVector, and iMatrix objects, which
// are sort of the building blocks for gerenal math objects. The "i" at the beginning of these names
// indicates that they should be for internal use only. It seems like these base types have the
// acceleration, e.g. SIMD or GPU or what-have-you, abstracted away. How you accelerate these things
// appears to be controlled through a template parameter called vtype.
// The general math/physics objects, such as a color matrix, are built up by nesting these objects.
// For instance a general color matrix has two color indices, so it's built up like
// iScalar<iScalar<iMatrix<vtype ...
// where the levels going from the inside out are color, spin, then Lorentz indices. Scalars have
// no indices, so it's what we use when such an index isn't needed. Lattice objects are made by one
// higher level of indexing using iVector.
// These types will be used for U and U_mu objects, respectively.
typedef typename Gimpl::GaugeLinkField GaugeMat;
typedef typename Gimpl::GaugeField GaugeLorentz;
// U_mu_nu(x)
static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) {
// Calls like CovShiftForward and CovShiftBackward have 3 arguments, and they multiply together
// the first and last argument. (Second arg gives the shift direction.) The CovShiftIdentityBackward
// has meanwhile only two arguments; it just returns the shifted (adjoint since backward) link.
plaq = Gimpl::CovShiftForward(U[mu],mu,
// Means Link*Cshift(field,mu,1), arguments are Link, mu, field in that order.
Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
// This means Cshift(adj(Link), mu, -1)
Gimpl::CovShiftIdentityBackward(U[nu], nu))));
}
// tr U_mu_nu(x)
static void traceDirPlaquette(ComplexField &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) {
// This .Grid() syntax seems to get the pointer to the GridBase. Apparently this is needed as argument
// to instantiate a Lattice object.
GaugeMat sp(U[0].Grid());
dirPlaquette(sp, U, mu, nu);
plaq = trace(sp);
}
// sum_mu_nu tr U_mu_nu(x)
static void sitePlaquette(ComplexField &Plaq, const std::vector<GaugeMat> &U) {
ComplexField sitePlaq(U[0].Grid());
Plaq = Zero();
// Nd=4 and Nc=3 are set as global constants in QCD.h
for (int mu = 1; mu < Nd; mu++) {
for (int nu = 0; nu < mu; nu++) {
traceDirPlaquette(sitePlaq, U, mu, nu);
Plaq = Plaq + sitePlaq;
}
}
}
// sum_mu_nu_x Re tr U_mu_nu(x)
static RealD sumPlaquette(const GaugeLorentz &Umu) {
std::vector<GaugeMat> U(Nd, Umu.Grid());
for (int mu = 0; mu < Nd; mu++) {
// Umu is a GaugeLorentz object, and as such has a non-trivial Lorentz index. We can
// access the element in the mu Lorentz index with this PeekIndex syntax.
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
ComplexField Plaq(Umu.Grid());
sitePlaquette(Plaq, U);
// I guess this should be the line that sums over all space-time sites.
auto Tp = sum(Plaq);
// Until now, we have been working with objects inside the tensor nest. This TensorRemove gets
// rid of the tensor nest to return whatever is inside.
auto p = TensorRemove(Tp);
return p.real();
}
// < Re tr U_mu_nu(x) >
static RealD avgPlaquette(const GaugeLorentz &Umu) {
// Real double type
RealD sumplaq = sumPlaquette(Umu);
// gSites() is the number of global sites. there is also lSites() for local sites.
double vol = Umu.Grid()->gSites();
// The number of orientations. 4*3/2=6 for Nd=4, as known.
double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
return sumplaq / vol / faces / Nc;
}
};
// Next we show an example of how to construct an input parameter class. We first inherit
// from Serializable. Then all class data members have to be defined using the
// GRID_SERIALIZABLE_CLASS_MEMBERS macro. This variadic macro allows for arbitrarily many
// class data members. In the below case, we make a parameter file holding the configuration
// name. Here, it expects the name to be labeled with "conf_name" in the configuration file.
struct ConfParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(
ConfParameters,
std::string, conf_name);
template <class ReaderClass>
ConfParameters(Reader<ReaderClass>& Reader){
// If we are reading an XML file, it should be structured like:
// <grid>
// <parameters>
// <conf_name>l20t20b06498a_nersc.302500</conf_name>
// </parameters>
// </grid>
read(Reader, "parameters", *this);
}
};
// This syntax lets you pass command line arguments to main. An asterisk means that what follows is
// a pointer. Two asterisks means what follows is a pointer to an array.
int main (int argc, char **argv)
{
// This initializes Grid. Some command line options include
// --mpi n.n.n.n
// --threads n
// --grid n.n.n.n
Grid_init(&argc, &argv);
// This is where you would specify a custom lattice size, if not from the command line. Here
// Nd is a global quantity that is currently set to 4.
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
Coordinate latt_size = GridDefaultLatt();
// Instantiate the spacetime Grid on which everything will be built.
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
// The PeriodicGimplD type is what you want for gauge matrices. There is also a LatticeGaugeFieldD
// type that you can use, which will work perfectly with what follows.
PeriodicGimplD::Field U(&GRID);
// Here we read in the parameter file params.json to get conf_name. The last argument is what the
// top organizational level is called in the param file.
XmlReader Reader("Example_plaquette.xml",false, "grid");
ConfParameters param(Reader);
// Load a lattice from SIMULATeQCD into U. SIMULATeQCD finds plaquette = 0.6381995717
FieldMetaData header;
NerscIO::readConfiguration(U, header, param.conf_name);
// Let's see what we find.
RealD plaq = WLoops<PeriodicGimplD>::avgPlaquette(U);
// This is how you make log messages.
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1) << "Plaquette = " << plaq << std::endl;
// To wrap things up.
Grid_finalize();
}

View File

@@ -1,19 +0,0 @@
--- ./Eigen/unsupported/Eigen/CXX11/Tensor 2018-07-23 10:33:42.000000000 +0100
+++ Tensor 2018-08-28 16:15:56.000000000 +0100
@@ -25,7 +25,7 @@
#include <utility>
#endif
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
#include "../SpecialFunctions"
#include "src/util/CXX11Meta.h"
@@ -147,6 +147,6 @@
#include "src/Tensor/TensorIO.h"
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
//#endif // EIGEN_CXX11_TENSOR_MODULE

View File

@@ -0,0 +1,67 @@
#!/bin/bash
#PBS -q debug
#PBS -l select=1
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
module load pti-gpu
#cat $PBS_NODEFILE
export OMP_NUM_THREADS=4
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 2 nodes, 24 ranks
#
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
#$CMD | tee 1node.comms
CMD="mpiexec -np 1 -ppn 1 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
#$CMD | tee 1tile.dwf
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1node.32.32.32.48.dwf
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.64.64.32.96.dwf
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.64.32.32.48.dwf

View File

@@ -0,0 +1,60 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=1024
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
export FI_CXI_CQ_FILL_PERCENT=10
export FI_CXI_DEFAULT_CQ_SIZE=262144
#export FI_CXI_DEFAULT_CQ_SIZE=131072
#export FI_CXI_CQ_FILL_PERCENT=20
# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 12288 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD
CMD="mpiexec -np 12288 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1024node.dwf.small.cq
CMD="mpiexec -np 12288 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1024node.dwf.cq

View File

@@ -0,0 +1,55 @@
#!/bin/bash
#PBS -q workq
#PBS -l select=2
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
module load pti-gpu
#cat $PBS_NODEFILE
export OMP_NUM_THREADS=4
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 2 nodes, 24 ranks
#
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD | tee 2node.comms
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2node.32.32.64.48.dwf
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2node.64.64.64.96.dwf

View File

@@ -0,0 +1,56 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=2048
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 24576 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD
CMD="mpiexec -np 24576 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2048node.dwf.small
CMD="mpiexec -np 24576 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2048node.dwf

View File

@@ -0,0 +1,48 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=256
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 3072 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD
CMD="mpiexec -np 3072 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 256node.dwf.large

View File

@@ -0,0 +1,48 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=512
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 6144 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD
CMD="mpiexec -np 6144 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 512node.dwf.large

View File

@@ -0,0 +1,80 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=32
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 32 nodes, 384 ranks
#
CMD="mpiexec -np 384 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1node.dwf
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 2node.dwf
CMD="mpiexec -np 48 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 4node.dwf
CMD="mpiexec -np 96 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 8node.dwf
CMD="mpiexec -np 192 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 16node.dwf
CMD="mpiexec -np 384 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 32node.dwf

View File

@@ -0,0 +1,34 @@
#!/bin/bash
#export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 )
#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
unset EnableWalkerPartition
export EnableImplicitScaling=0
export ZE_AFFINITY_MASK=$gpu_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
if [ $PALS_RANKID = "0" ]
then
# numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline "$@"
# numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
numactl -m $NUMA -N $NUMA "$@"
else
numactl -m $NUMA -N $NUMA "$@"
fi

View File

@@ -0,0 +1,29 @@
#!/bin/bash
export NUMA_MAP=(2 2 3 3 2 2 3 3 )
export PROC_MAP=(0 0 1 1 0 0 1 1 )
export NIC_MAP=(0 0 4 4 1 1 5 5 )
export GPU_MAP=(0 1 3 4 0 1 3 4 )
export TILE_MAP=(0 0 0 0 1 1 1 1 )
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]}
#export GRID_MPICH_NIC_BIND=$NIC
unset EnableWalkerPartition
export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA"
numactl -m $NUMA -N $PROC_MAP "$@"

View File

@@ -0,0 +1,16 @@
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=yes\
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"

View File

@@ -0,0 +1,9 @@
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
git config --global http.proxy http://proxy.alcf.anl.gov:3128
module use /soft/modulefiles
module load intel_compute_runtime/release/agama-devel-682.22

View File

@@ -0,0 +1,2 @@
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

View File

@@ -0,0 +1,28 @@
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
module load oneapi/release/2023.12.15.001
#module use /soft/modulefiles
#module load intel_compute_runtime/release/agama-devel-682.22
export FI_CXI_DEFAULT_CQ_SIZE=131072
export FI_CXI_CQ_FILL_PERCENT=20
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
#
# -ftarget-register-alloc-mode=pvc:default
# -ftarget-register-alloc-mode=pvc:small
# -ftarget-register-alloc-mode=pvc:large
# -ftarget-register-alloc-mode=pvc:auto
#
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
git config --global http.proxy http://proxy.alcf.anl.gov:3128
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

View File

@@ -0,0 +1,41 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=128
#PBS -l walltime=02:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 16 nodes, 192 ranks
# 12 ppn, 128 nodes, 1536 ranks
CMD="mpiexec -np 1536 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap "
$CMD

View File

@@ -0,0 +1,61 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -l select=16:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=01:00:00
#PBS -N dwf
#PBS -k doe
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
#source ../sourceme.sh
cat $PBS_NODEFILE
#export MPICH_COLL_SYNC=1
#export MPICH_ENV_DISPLAY=1
export MPICH_
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
DIR=repro.$PBS_JOBID
mkdir $DIR
cd $DIR
CMD="mpiexec -np 192 -ppn 12 -envall \
../gpu_tile_compact.sh \
../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative"
#--comms-overlap
$CMD
grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*

View File

@@ -0,0 +1,82 @@
#!/bin/bash
#PBS -l select=16:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N repro1gpu
#PBS -k doe
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
cd $PBS_O_WORKDIR
NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE
echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do
THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE
for g in {0..11}
do
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )
export numa=${NUMA_MAP[$g]}
export gpu_id=${GPU_MAP[$g]}
export tile_id=${TILE_MAP[$g]}
export gpu=$gpu_id.$tile_id
cd $PBS_O_WORKDIR
DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
mkdir -p $DIR
cd $DIR
echo $THIS_NODE > nodefile
echo $gpu > gpu
export ZE_AFFINITY_MASK=$gpu
export ONEAPI_DEVICE_FILTER=gpu,level_zero
CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \
numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
echo $CMD
$CMD &
done
done
wait

View File

@@ -0,0 +1,98 @@
#!/bin/bash
#PBS -l select=32:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N reproN
#PBS -k doe
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=0
export GRID_CHECKSUM_SEND_BUF=0
export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
cd $PBS_O_WORKDIR
NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE
echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do
cd $PBS_O_WORKDIR
THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
mkdir -p $DIR
cd $DIR
echo $THIS_NODE > nodefile
#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
# ../../gpu_tile_compact.sh \
# ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
# --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
../../gpu_tile_compact.sh \
../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"
echo $CMD > command-line
env > environment
$CMD &
done
# Suspicious wait is allowing jobs to collide and knock out
#wait
sleep 6500
for n in ` eval echo {1..$NN} `
do
THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE
cd $DIR
grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*
done

View File

@@ -0,0 +1,40 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=16
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 16 nodes, 192 ranks
CMD="mpiexec -np 192 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap"
$CMD

View File

@@ -0,0 +1,70 @@
Memory Bandwidth
Bytes, GB/s per node
3145728, 225.900365
50331648, 2858.859504
254803968, 4145.556367
805306368, 4905.772480
1966080000, 4978.312557
GEMM
M, N, K, BATCH, GF/s per rank
16, 8, 16, 256, 1.713639
16, 16, 16, 256, 288.268316
16, 32, 16, 256, 597.053950
32, 8, 32, 256, 557.382591
32, 16, 32, 256, 1100.145311
32, 32, 32, 256, 1885.080449
64, 8, 64, 256, 1725.163599
64, 16, 64, 256, 3389.336566
64, 32, 64, 256, 4168.252422
16, 8, 256, 256, 1326.262134
16, 16, 256, 256, 2318.095475
16, 32, 256, 256, 3555.436503
32, 8, 256, 256, 1920.139170
32, 16, 256, 256, 3486.174753
32, 32, 256, 256, 5320.821724
64, 8, 256, 256, 2539.597502
64, 16, 256, 256, 5003.456775
64, 32, 256, 256, 7837.531562
8, 256, 16, 256, 1427.848170
16, 256, 16, 256, 2222.147815
32, 256, 16, 256, 2877.121715
8, 256, 32, 256, 1922.890086
16, 256, 32, 256, 3199.469082
32, 256, 32, 256, 4845.405343
8, 256, 64, 256, 2639.483343
16, 256, 64, 256, 5012.800299
32, 256, 64, 256, 7216.006882
Communications
Packet bytes, direction, GB/s per node
4718592, 2, 206.570734
4718592, 3, 207.501847
4718592, 6, 189.730277
4718592, 7, 204.301218
15925248, 2, 307.882997
15925248, 3, 287.901076
15925248, 6, 295.603109
15925248, 7, 300.682033
37748736, 2, 331.740364
37748736, 3, 338.610627
37748736, 6, 332.580657
37748736, 7, 336.336579
Per node summary table
L , Wilson, DWF4, Staggered, GF/s per node
8 , 16, 1165, 10
12 , 473, 4901, 163
16 , 1436, 8464, 442
24 , 4133, 10139, 1530
32 , 5726, 11487, 2518
1 Memory Bandwidth
2 Bytes, GB/s per node
3 3145728, 225.900365
4 50331648, 2858.859504
5 254803968, 4145.556367
6 805306368, 4905.772480
7 1966080000, 4978.312557
8 GEMM
9 M, N, K, BATCH, GF/s per rank
10 16, 8, 16, 256, 1.713639
11 16, 16, 16, 256, 288.268316
12 16, 32, 16, 256, 597.053950
13 32, 8, 32, 256, 557.382591
14 32, 16, 32, 256, 1100.145311
15 32, 32, 32, 256, 1885.080449
16 64, 8, 64, 256, 1725.163599
17 64, 16, 64, 256, 3389.336566
18 64, 32, 64, 256, 4168.252422
19 16, 8, 256, 256, 1326.262134
20 16, 16, 256, 256, 2318.095475
21 16, 32, 256, 256, 3555.436503
22 32, 8, 256, 256, 1920.139170
23 32, 16, 256, 256, 3486.174753
24 32, 32, 256, 256, 5320.821724
25 64, 8, 256, 256, 2539.597502
26 64, 16, 256, 256, 5003.456775
27 64, 32, 256, 256, 7837.531562
28 8, 256, 16, 256, 1427.848170
29 16, 256, 16, 256, 2222.147815
30 32, 256, 16, 256, 2877.121715
31 8, 256, 32, 256, 1922.890086
32 16, 256, 32, 256, 3199.469082
33 32, 256, 32, 256, 4845.405343
34 8, 256, 64, 256, 2639.483343
35 16, 256, 64, 256, 5012.800299
36 32, 256, 64, 256, 7216.006882
37 Communications
38 Packet bytes, direction, GB/s per node
39 4718592, 2, 206.570734
40 4718592, 3, 207.501847
41 4718592, 6, 189.730277
42 4718592, 7, 204.301218
43 15925248, 2, 307.882997
44 15925248, 3, 287.901076
45 15925248, 6, 295.603109
46 15925248, 7, 300.682033
47 37748736, 2, 331.740364
48 37748736, 3, 338.610627
49 37748736, 6, 332.580657
50 37748736, 7, 336.336579
51 Per node summary table
52 L , Wilson, DWF4, Staggered, GF/s per node
53 8 , 16, 1165, 10
54 12 , 473, 4901, 163
55 16 , 1436, 8464, 442
56 24 , 4133, 10139, 1530
57 32 , 5726, 11487, 2518

View File

@@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$LIME \
--disable-accelerator-cshift \
--enable-accelerator-cshift \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"

View File

@@ -1,5 +1,5 @@
module load GCC/9.3.0
module load GMP/6.2.0
module load MPFR/4.1.0
module load OpenMPI/4.1.0rc1
module load CUDA/11.3
module load GCC
module load GMP
module load MPFR
module load OpenMPI
module load CUDA

View File

@@ -0,0 +1,23 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"

View File

@@ -0,0 +1,13 @@
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich/8.1.23
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH

View File

@@ -0,0 +1,57 @@
#!/bin/bash -l
#SBATCH --job-name=fthmc3ge
#SBATCH --partition=small-g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
##SBATCH --cpus-per-task=8
#SBATCH --gpus-per-node=8
#SBATCH --time=2:00:00
#SBATCH --account=project_465000546
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0
#sbatch --dependency=afterany:$SLURM_JOBID fthmc3gev.slurm
CPU_BIND="map_ldom:3,3,1,1,0,0,2,2"
MEM_BIND="map_mem:3,3,1,1,0,0,2,2"
echo $CPU_BIND
cat << EOF > ./select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 4 5 6 7)
export NUMA_MAP=(3 3 1 1 0 0 2 2)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUM=\${NUMA_MAP[\$SLURM_LOCALID]}
#export HIP_VISIBLE_DEVICES=\$GPU
export ROCR_VISIBLE_DEVICES=\$GPU
echo RANK \$SLURM_LOCALID using GPU \$GPU
echo NUMA \$SLURM_LOCALID using NUMA \${NUM}
echo numactl -m \$NUM -N \$NUM \$*
exec numactl -m \$NUM -N \$NUM \$*
EOF
cat ./select_gpu
chmod +x ./select_gpu
root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_SMP_SINGLE_COPY_MODE=CMA
export MPICH_GPU_SUPPORT_ENABLED=1
#cfg=`ls -rt ckpoint_*lat* | tail -n 1 `
#traj="${cfg#*.}"
#cfg=`ls -rt ckpoint_*lat* | tail -n 1 `
traj=0
vol=32.32.32.64
mpi=1.2.2.2
PARAMS="--mpi $mpi --accelerator-threads 16 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol"
#HMCPARAMS="--StartingType CheckpointStart --StartingTrajectory $traj --Trajectories 200"
HMCPARAMS="--StartingType ColdStart --StartingTrajectory $traj --Trajectories 20"
srun ./select_gpu ../FTHMC2p1f_3GeV $HMCPARAMS $PARAMS

View File

@@ -23,7 +23,7 @@ echo mpfr X$MPFR
--disable-fermion-reps \
--disable-gparity \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++17 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp"

View File

@@ -1,3 +1,5 @@
export https_proxy=http://proxy-chain.intel.com:911
module load intel-release
module load intel/mpich
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

View File

@@ -0,0 +1,42 @@
#!/bin/bash
#SBATCH --partition csi
#SBATCH --time=00:10:00
#SBATCH -A csigeneral
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --qos csi
#SBATCH --gres=gpu:4
source sourceme.sh
cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export CUDA_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec \$*
EOF
chmod +x ./select_gpu
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=no
export UCX_MEMTYPE_CACHE=n
export OMP_NUM_THREAD=8
#srun -N1 -n1 nvidia-smi
#srun -N1 -n1 numactl -H > numa.txt
srun -N1 -n1 lstopo A100-topo.pdf
# 4.35 TF/s
#srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0 --accelerator-threads 16
srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0 --accelerator-threads 16

View File

@@ -0,0 +1,17 @@
../../configure \
--enable-comms=mpi-auto \
--enable-unified=no \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--enable-gen-simd-width=64 \
--enable-simd=GPU \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
CXX=nvcc \
MPICXX=mpicxx \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared"

View File

@@ -0,0 +1,2 @@
module load cuda/12.2
module load openmpi

View File

@@ -0,0 +1,6 @@
HDF=$HOME/paboyle/install
LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=NEONv8 --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF
#LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=GEN --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF

Some files were not shown because too many files have changed in this diff Show More