1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-22 16:54:48 +01:00

Compare commits

..

606 Commits

Author SHA1 Message Date
d4290a7434 finer timers in Benchmark_IO 2021-06-17 11:57:02 +01:00
Peter Boyle
92def28bd3 Update README.md 2021-06-06 04:52:05 -04:00
ca10bfa1c7 removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore) 2021-06-04 11:12:22 +01:00
Peter Boyle
0e27e3847d Remove synch 2021-06-03 04:24:19 +00:00
u61464
8cfc7342cd staggered hand unroll read coalesce 2021-05-05 14:17:18 -07:00
u61464
15ae317858 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-05-04 08:40:38 -07:00
u61464
834f536b5f Fastest option on SyCL is now std::complex 2021-05-04 08:40:18 -07:00
Peter Boyle
c332d9f08b Merge pull request #356 from felixerben/bugfix/stoutSmearing
Jamie's fix
2021-04-27 14:10:49 -04:00
cf2923d5dd Jamie's fix 2021-04-27 16:53:37 +01:00
Peter Boyle
0e4413ddde Merge pull request #355 from felixerben/bugfix/stoutSmearing
bugfix 3D stout smearing
2021-04-27 08:01:55 -04:00
009ccd581e bugfix 3D stout smearing 2021-04-26 10:36:33 +01:00
Peter Boyle
8cd4263974 Tests compile 2021-04-25 22:20:37 -04:00
Peter Boyle
d45c868656 Change interface 2021-04-25 10:53:34 -04:00
Peter Boyle
955a8113de Expose label only to reduce number of parameters 2021-04-25 10:36:38 -04:00
Peter Boyle
dbe210dd53 Open the ens_id 2021-04-25 10:25:59 -04:00
Peter Boyle
86e11743ca set twists 2021-04-20 10:19:11 -04:00
Peter Boyle
980e721f6e Update MetaData.h 2021-04-13 09:33:01 -04:00
Peter Boyle
e2a0142d87 Merge pull request #348 from AndrewYongZhenNing/develop
Conserved Tadpole Implementation for Shamir Action Only
2021-04-06 10:49:00 -04:00
895244ecc3 Merge with upstream; implemented conserved tadpole for Shamir action. 2021-04-06 13:46:33 +01:00
addeb621a7 Implemented tadpole operator for Shamir action. 2021-04-06 13:45:37 +01:00
Peter Boyle
a7fb25adf6 Make Cshift fields static to avoid repeated reallocaate overhead 2021-03-29 21:44:14 +02:00
Peter Boyle
e947992957 Improved force terms 2021-03-29 20:04:06 +02:00
Peter Boyle
bb89a82a07 Staggered coalseced read 2021-03-29 20:01:15 +02:00
Peter Boyle
8bdadbadac Cold start 2021-03-18 15:41:14 -04:00
Peter Boyle
15c50a7442 Explicit instantiate the template function 2021-03-18 15:40:42 -04:00
Peter Boyle
49b0af2c95 Update of tests to compile with the sRNG addition.
Audited the code conventions (again) with the CPS momentum denominator
and added anti periodic in time to the Test_mobius_force.cc and
tested the Test_dwf_gpforce.

Promoted thesee to test full HMC hamiltonian, tr P^2/2 + phidag MdagM phi

with the same pdot and Udot as audited in the Integrator.h etc...

With full comments and sources for factors.
2021-03-18 09:10:02 -04:00
Peter Boyle
9c2b37218a sRNG parameter added 2021-03-18 06:24:11 -04:00
Peter Boyle
3c67d626ba Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-12 15:36:55 +01:00
Peter Boyle
51f506553c Read out the local ID once, and store 2021-03-12 15:33:04 +01:00
Peter Boyle
226be84937 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-12 09:31:50 -05:00
Peter Boyle
001814b442 updated to do list. Start adding DDHMC work items 2021-03-12 09:31:17 -05:00
Peter Boyle
db3ac67506 Update thread issue 2021-03-12 14:55:07 +01:00
Peter Boyle
da91a884ef NVCC versions found buggy added as guard 2021-03-11 23:54:53 +01:00
Peter Boyle
a71e6755e3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-11 22:43:06 +01:00
Peter Boyle
cd5891eecd Test that fails on Cuda 11.0 2021-03-11 22:34:28 +01:00
Peter Boyle
5bb7336f27 Merge pull request #347 from pjgeorg/fix-autotools-avx512
Fix inconsistent SIMD option AVX512

Thanks
2021-03-11 16:29:07 -05:00
Peter Boyle
ce1fc1f48a Possible fallback plan for Fionn's compiler bbug in nvcc 2021-03-11 22:20:53 +01:00
Peter Georg
82402c6a7c Add simd option SKL for ICC 2021-03-11 13:08:40 +01:00
Peter Georg
d9c4afe5b7 Fix inconsistent configure option AVX512
Before this change AVX512 enabled different instruction sets depending
on the compiler:

For Intel C++ Compiler Classic (ICC):
    AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL
    i.e. Intel Xeon Skylake and newer

For Intel ICX, gcc, clang:
    AVX512F, AVX512CD, AVX512ER, AVX512PF
    i.e. Intel Xeon Phi x200/x205 (KNL/KNM)

With this commit AVX512 now only enables the common instruction sets
supported by all CPUs supporting any AVX-512 instructions set:
AVX512F and AVX512CD (called COMMON-AVX512 by icc)
2021-03-11 12:58:49 +01:00
Peter Boyle
f786ff8d69 Extend test from Fionn, fails on A100 apparently 2021-03-10 14:32:06 -05:00
u61464
a651caed5f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-10 06:23:51 -08:00
u61464
0e21adb3f6 Gives 200GF/s on SyCL/DG1 8^4, doesn't uglify develop for other platforms too badly.
Easy to revert to clean more C++ stylistic code. Theres a SYCL_HACK macro I will clean up later once dpcpp
evolves a central nervous systems.
2021-03-10 05:40:51 -08:00
Peter Boyle
58bf9b9e6d Clean up test 2021-03-10 02:45:22 +01:00
Peter Boyle
2146eebb65 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-09 04:31:46 +01:00
Peter Boyle
6a429ee6d3 2d loop hits Nvidia 16bit limit on large local vols 2021-03-09 04:31:10 +01:00
Peter Boyle
4d1ea15c79 More verbosity. The 16bit limit on Grid.y, Grid.z is annoying 2021-03-09 04:29:37 +01:00
Peter Boyle
a76cb005e0 Update Tensor_exp.h 2021-03-08 13:37:57 -05:00
Peter Boyle
a9604367c1 Merge pull request #336 from lehner/feature/gpt
Make ShmDims configurable; adjust GRID_MAX_SIMD to allow for 128 byte width on GPUs
2021-03-05 13:17:19 -05:00
Peter Boyle
d7065023cc Merge pull request #332 from mmphys/feature/mres_schur
Optional changes to Test_cayley_mres e.g. Schur solver
2021-03-05 12:47:07 -05:00
Peter Boyle
89d299ceec Merge pull request #333 from mmphys/bugfix/LatTransfer
Fix convertType for GPU in Lattice_transfer.h
2021-03-05 12:46:33 -05:00
Peter Boyle
e34eda66df Merge pull request #344 from felixerben/feature/XiToSigma
Feature/xi to sigma
2021-03-05 12:45:44 -05:00
Christoph Lehner
b24181aa4f Update Coordinate.h
Revert GRID_MAX_SIMD change
2021-03-05 16:56:58 +01:00
Peter Boyle
aa173e2998 Update README.md 2021-03-05 10:25:33 -05:00
7a19432e0b whitespace 2021-03-05 10:57:09 +00:00
9b15704290 tested and consitent 2021-03-05 10:42:32 +00:00
Michael Marshall
017f955b2d Merge branch 'develop' into feature/mres_schur
* develop:
  Pass serial RNG around
  Sycl happier
2021-03-04 20:42:02 +00:00
Michael Marshall
f252d69eef Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Pass serial RNG around
  Sycl happier
2021-03-04 20:41:30 +00:00
3b06e4655e Merge branch 'develop' into feature/XiToSigma 2021-03-04 20:06:16 +00:00
d4b4de8f42 changes 2021-03-04 20:01:24 +00:00
Peter Boyle
c90beee774 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-03 23:50:29 +01:00
Peter Boyle
1eea9d73b9 Pass serial RNG around 2021-03-03 23:50:01 +01:00
u61464
679d1d22f7 Sycl happier 2021-03-03 11:21:43 -08:00
Michael Marshall
b2b5e0b98c Merge branch 'develop' into feature/mres_schur
* develop:
  Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.
  Better SIMD usage/coalescence
2021-03-03 16:15:12 +00:00
Michael Marshall
03e54722c1 Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.
2021-03-03 16:13:23 +00:00
Peter Boyle
442336bd96 Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case.
Other cases to do. This now includes comms code path.
2021-03-02 14:50:51 +01:00
Christoph Lehner
9c9566b9c9 Merge pull request #23 from paboyle/develop
Sync
2021-03-01 12:33:51 +01:00
Michael Marshall
1059a81a3c Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Better SIMD usage/coalescence
2021-02-27 00:21:36 +00:00
Peter Boyle
2e61556389 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-02-26 17:52:20 +01:00
Peter Boyle
f9b1f240f6 Better SIMD usage/coalescence 2021-02-26 17:51:41 +01:00
Michael Marshall
69f41469dd Merge branch 'develop' into bugfix/LatTransfer
* develop: (26 commits)
  Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
  Correct misleading ac help string
  Enable performance counting in WilsonFermion like in others
  changed back A2AUtils warning
  changed if and accelerator_for - no runtime errors any more
  Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
  Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
  Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field
  MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error
  maxLocalNorm2()
  change back benchmark_ITT
  prettify
  Flop cout matches DiRAC-ITT-2020
  revert changes
  merge develop
  fixes
  weird bug in 2pt function...
  revert changes
  final version, tested on CPU and GPU
  bugfix
  ...
2021-02-25 09:19:17 +00:00
Michael Marshall
d620b303ff Merge branch 'develop' into feature/mres_schur
* develop: (26 commits)
  Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
  Correct misleading ac help string
  Enable performance counting in WilsonFermion like in others
  changed back A2AUtils warning
  changed if and accelerator_for - no runtime errors any more
  Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
  Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
  Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field
  MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error
  maxLocalNorm2()
  change back benchmark_ITT
  prettify
  Flop cout matches DiRAC-ITT-2020
  revert changes
  merge develop
  fixes
  weird bug in 2pt function...
  revert changes
  final version, tested on CPU and GPU
  bugfix
  ...
2021-02-24 18:07:27 +00:00
Peter Boyle
157fd1428d Merge pull request #342 from paboyle/feature/link-update-mask
Feature/link update mask
2021-02-24 11:29:52 -05:00
Christopher Kelly
c791cb2214 Merge branch 'develop' into feature/link-update-mask 2021-02-23 11:51:54 -05:00
Christopher Kelly
d5ab571a89 Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces
Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC
Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
2021-02-23 11:49:56 -05:00
0ed800f6e4 merge develop 2021-02-23 14:54:46 +00:00
Peter Boyle
0a32183825 Merge pull request #335 from felixerben/gpu/baryons
Gpu/baryons
2021-02-23 09:30:16 -05:00
Peter Boyle
2cacfbde2a Merge pull request #341 from DanielRichtmann/fix/minor-things
Minor fixes
2021-02-22 09:28:50 -05:00
Daniel Richtmann
c073e62e0b Correct misleading ac help string 2021-02-22 15:25:44 +01:00
Daniel Richtmann
e3d019bc2f Enable performance counting in WilsonFermion like in others 2021-02-22 15:25:40 +01:00
7ae030f585 changed back A2AUtils warning 2021-02-18 13:24:50 +00:00
86b58d5aff changed if and accelerator_for - no runtime errors any more 2021-02-18 12:04:32 +00:00
Peter Boyle
26e8b9f4a5 Merge pull request #340 from mmphys/bugfix/config
Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
2021-02-17 11:56:21 -05:00
Michael Marshall
35114c9e62 Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu 2021-02-17 13:24:15 +00:00
Peter Boyle
dfd28a85c9 Merge pull request #339 from mmphys/bugfix/config
Optional rename PACKAGE_ to GRID_ in Grid/Config.h
2021-02-15 13:53:26 -05:00
Michael Marshall
a503332924 Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working.
Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
2021-02-14 21:27:54 +00:00
Peter Boyle
1ac13ec3a7 Merge pull request #338 from paboyle/bugfix/maxnorm2
Fixed compile issues with maxLocalNorm2 for non-scalar lattices
2021-02-08 12:08:11 -05:00
Christopher Kelly
55de69a569 Fixed compile issues with maxLocalNorm2 for non-scalar lattices
maxLocalNorm2 test now reuses the random field
2021-02-08 12:03:16 -05:00
Peter Boyle
eda9ab487b MADWF 5d source option for hadrons - look at Grid of source
Abort on GPU error
2021-02-08 10:47:22 -05:00
Peter Boyle
cd99edcc5f maxLocalNorm2() 2021-02-04 18:25:49 -05:00
Christoph Lehner
4705aa541d Allow user to configure ShmDims via environment variables 2021-02-04 14:25:55 +01:00
Michael Marshall
3215d88a91 Simplify syntax with Grid::EnableIf post code review. Updated EnableIf so that ReturnType defaults to void in same way as std::enable_if see https://en.cppreference.com/w/cpp/types/enable_if 2021-02-03 15:17:03 +00:00
9b9a53f870 ... 2021-02-02 13:06:43 +00:00
Christoph Lehner
019ffe17d4 Allow for GPU vector width beyond 64 2021-02-02 11:32:23 +01:00
bc496dd844 change back benchmark_ITT 2021-01-28 14:29:56 +00:00
a673b6a54d prettify 2021-01-28 14:15:09 +00:00
1bf2e4d187 Merge branch 'develop' into gpu/baryons 2021-01-27 21:17:37 +00:00
Peter Boyle
96dd7a8fbd Flop cout matches DiRAC-ITT-2020 2021-01-27 21:14:52 +00:00
7905afa9f5 revert changes 2021-01-27 21:14:52 +00:00
712bb40650 merge develop 2021-01-27 21:14:52 +00:00
81d88d9f4d fixes 2021-01-27 21:09:51 +00:00
Michael Marshall
77063418da Fix issue for GPU by ensuring accelerator_inline version of convertType is available for Grid::complex<T>. This removes many warnings in Hadrons
Simplify the SFINAE syntax and correct convertType for iScalar
2021-01-25 15:09:36 +00:00
Michael Marshall
2983b6fdf6 Optional (superficial) changes to make comparison with Hadrons WardIdentity module easier: use Schur solver; example of Hadrons random gauge init; logging updates; only solve reverse propagator if provided 2021-01-23 12:41:48 +00:00
Peter Boyle
69f1f04f74 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-01-21 21:39:59 -05:00
Peter Boyle
11a5fd09d6 Hot config 2021-01-21 21:39:41 -05:00
Peter Boyle
ff1fa98808 Fix for GPU conserveed current 2021-01-21 21:38:23 -05:00
df16202865 weird bug in 2pt function... 2021-01-19 19:25:27 +00:00
3ff7c2c02a Merge branch 'develop' into gpu/baryons 2021-01-19 12:34:13 +00:00
fc6d07897f revert changes 2021-01-19 12:32:48 +00:00
f9c8e5c8ef Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-01-19 12:30:29 +00:00
8bfa0e74f8 final version, tested on CPU and GPU 2021-01-19 12:27:57 +00:00
9b73a937e7 bugfix 2021-01-18 18:57:05 +00:00
Peter Boyle
b0339bc5a4 Merge branch 'feature/conjugate-bc-dirs' into develop 2021-01-15 09:28:39 -05:00
Peter Boyle
3c23a947cc Fixed test for very much non-unit det 2021-01-15 09:16:02 -05:00
Peter Boyle
56111bb823 Merge branch 'develop' into feature/conjugate-bc-dirs 2021-01-14 21:01:22 -05:00
Peter Boyle
99445673f6 Gparity fix, and plaquette IO 2021-01-14 21:00:36 -05:00
Peter Boyle
97a59643f7 Red black coarse space 2021-01-14 20:49:13 -05:00
Peter Boyle
579595f547 Red black on coarse space 2021-01-14 20:48:35 -05:00
Peter Boyle
281ac5fc12 Red black support on coars 2021-01-14 20:48:08 -05:00
Peter Boyle
d8fa903b02 G5 on coarse spaces 2021-01-14 20:47:28 -05:00
Peter Boyle
eaff0f3aeb Gamma5 on coaree spaces 2021-01-14 20:46:58 -05:00
Peter Boyle
e8e20c01b2 Coarsened vector test 2021-01-14 20:46:21 -05:00
Peter Boyle
a4afc3ea2a Red black coarse space 2021-01-14 20:44:16 -05:00
fa12b9a329 bugfix 2021-01-13 10:04:17 +00:00
45fc7ded3a test for sum 2021-01-12 09:10:37 +00:00
74de2d9742 whitespace changes 2021-01-08 18:28:36 +00:00
e759367d42 tested and working 2021-01-08 18:04:50 +00:00
Christoph Lehner
299d0de066 Merge pull request #21 from paboyle/develop
Sync
2020-12-22 20:59:15 +01:00
Peter Boyle
3fe75bc7cb Merge pull request #329 from nmeyer-ur/feature/a64fx-3
Revised dslash/dwf kernels for A64FX
2020-12-20 08:17:15 -05:00
Nils Meyer
45d49d8648 clean up 2020-12-19 03:35:18 +01:00
Nils Meyer
6013183361 removed Asm impls 2020-12-19 03:25:01 +01:00
Nils Meyer
4b882e8056 fixed lost bracket 2020-12-19 03:09:20 +01:00
Nils Meyer
3f9ae6e7e7 Merge branch 'develop' into feature/a64fx-3 2020-12-19 02:37:11 +01:00
Nils Meyer
909acd55cd vnum variant for prefetches 2020-12-19 02:00:22 +01:00
Nils Meyer
4dd9e39e0d up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1 2020-12-19 00:54:31 +01:00
Christoph Lehner
b4c1317ab4 Merge pull request #22 from DanielRichtmann/feature/clover-access-specifier
Clover access specifier
2020-12-18 16:20:19 +01:00
f36d6f3923 compiles on GPU. 3pt still wrong!!!! 2020-12-17 17:04:08 +00:00
Peter Boyle
7adb253e25 Merge pull request #328 from mmphys/feature/mrespatch
Enable existing conserved current code for CUDA
2020-12-17 11:10:29 -05:00
808f1e0e8c merge develop 2020-12-15 16:33:29 +00:00
Michael Marshall
873519e960 Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration 2020-12-14 16:06:10 +00:00
Peter Boyle
9aec4a3c26 SYCL 2020-12-10 02:11:17 -08:00
Daniel Richtmann
c438118fd7 Change access specifier of clover fields in order to allow deriving classes to access these 2020-12-08 14:42:11 +01:00
Peter Boyle
70510d151b Merge pull request #327 from paboyle/feature/gparity_twist_GPU
Feature/gparity twist gpu
2020-12-07 12:02:20 -05:00
Christopher Kelly
9e7bacb5a4 Merge branch 'develop' into feature/gparity_twist_GPU 2020-12-07 11:55:39 -05:00
Christopher Kelly
2ef1fa66a8 Improved performance of G-parity kernel for GPUs by simplifying multLink implementation 2020-12-07 11:53:35 -05:00
Peter Boyle
cf76741ec6 Intel DPCPP Gold happy now (compiles all, runs Benchmark_dwf_fp32 ) 2020-12-03 03:47:11 -08:00
Peter Boyle
497e7c1c40 Duplicate code 2020-12-02 17:55:30 -08:00
Peter Boyle
888eacd3b8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-24 21:46:33 -05:00
Peter Boyle
321f0f51b5 Project to SU(N) 2020-11-24 21:46:10 -05:00
Christoph Lehner
17ec9c5545 Merge pull request #20 from paboyle/develop
Sync
2020-11-24 12:20:43 +01:00
Peter Boyle
30ad9578a2 Merge branch 'lehner-feature/gpt' into develop 2020-11-24 06:10:24 -05:00
Peter Boyle
9dce101586 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into lehner-feature/gpt 2020-11-24 06:10:16 -05:00
Peter Boyle
97e264d0ff Christoph's changes 2020-11-23 15:46:11 +00:00
Peter Boyle
683a5e5bf5 Stencil use host vector for integera table on enable-shared=no and mirror it on device 2020-11-23 15:39:51 +00:00
Peter Boyle
d4861a362c Stencil use non-UVM memory for look up table on enable-shared=no 2020-11-23 15:38:49 +00:00
Peter Boyle
5ff3eae027 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-20 13:14:44 -05:00
Peter Boyle
147dc15d26 Update 2020-11-20 13:13:59 -05:00
Christoph Lehner
c61ea72949 Merge pull request #19 from paboyle/develop
Sync
2020-11-20 17:31:13 +01:00
Peter Boyle
86e8b9fe38 ALLOC_ALIGN removed 2020-11-20 17:07:16 +01:00
Peter Boyle
612e468889 Configurable ALLOC_ALIGN and ALLOC_CACHE 2020-11-20 16:48:28 +01:00
Christoph Lehner
4ea8d128c2 Merge pull request #18 from paboyle/develop
Sync
2020-11-20 15:36:50 +01:00
Peter Boyle
e49b7f2f88 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-19 19:24:41 +01:00
Peter Boyle
aace3d47b9 partial work in progress 2020-11-19 19:24:14 +01:00
Peter Boyle
d5049949a4 Starting to fix reunitarise 2020-11-19 19:23:41 +01:00
Peter Boyle
f1c7480e3c Warning remove 2020-11-19 19:23:03 +01:00
Peter Boyle
5adae5d6ff Unused variable remove 2020-11-19 19:22:12 +01:00
Peter Boyle
a8412ace05 Merge pull request #317 from i-kanamori/develop
adding an error check for input: Parameters.StartingType
2020-11-18 23:09:40 -05:00
Peter Boyle
9fd1c2ad4b Merge pull request #325 from DanielRichtmann/feature/threaded-clover-inversion
Threaded clover term inversion
2020-11-18 23:08:37 -05:00
Peter Boyle
4cf3575353 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-18 03:07:36 +00:00
Peter Boyle
804a810d68 Wildcard mismatch 2020-11-18 03:06:53 +00:00
Peter Boyle
8fcb392e24 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-17 04:51:31 -08:00
Peter Boyle
dd8d70eeff Build without LIME 2020-11-17 04:41:15 -08:00
Peter Boyle
aa8aba6543 --shm-force-mpi 2020-11-16 20:15:50 -05:00
Peter Boyle
13df14f96e Switch off SHM paths with --disable-shm 2020-11-16 18:07:15 -05:00
Peter Boyle
3aab983760 Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency) 2020-11-16 17:13:58 +01:00
Peter Boyle
9c4dcc5ea3 Merge branch 'master' into develop 2020-11-16 16:34:57 +01:00
Peter Boyle
a1063ddbb9 Update options and simplify 2020-11-13 04:11:03 +01:00
Peter Boyle
18ef8056ec Hide Shared Memory 2020-11-13 04:10:40 +01:00
Peter Boyle
1c673977fa Must ask for COMMMS_THREADS 2020-11-13 03:59:36 +01:00
Peter Boyle
e9bc748828 Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel 2020-11-13 03:58:34 +01:00
Peter Boyle
f48156529b Work on 2,2,2,8 ranks 2020-11-13 03:57:58 +01:00
Peter Boyle
d05ce01809 TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED 2020-11-13 03:52:19 +01:00
Peter Boyle
cf23eff60e Device to Device, Memset, cannot assume UVM == Communicable 2020-11-13 03:51:08 +01:00
Peter Boyle
6e313575be Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac 2020-11-13 03:50:16 +01:00
Peter Boyle
b13d1f7238 TOFU compat flag to help Isaaku 2020-11-13 03:49:44 +01:00
Peter Boyle
b5e7945dd9 Option for host or device Cshift implementation 2020-11-13 01:38:54 +01:00
Peter Boyle
7535566f54 Option for bounce through the SHM buffer 2020-11-12 22:54:27 +01:00
Peter Boyle
50b808ab33 Configure option between host and device 2020-11-12 22:28:12 +01:00
Peter Boyle
f16c2665f5 Host memory explict 2020-11-12 20:29:58 +01:00
Peter Boyle
41e28015ae Volume divisible guarantee 2020-11-07 13:32:16 +01:00
3594ce877b speedup in Sigma-to-nucleon 2020-11-03 20:04:30 +00:00
9bae6b889a speedup in Sigma-to-nucleon 2020-11-03 20:03:09 +00:00
4014dfd5b9 first tested version 2020-11-03 16:13:08 +00:00
67023c334b bugfix 2020-11-03 13:07:37 +00:00
a3de7026c8 bugfix 2020-11-03 12:51:50 +00:00
ee11678b1f added Xi-to-Sigma rare decays 2020-11-03 12:41:35 +00:00
Peter Boyle
a0ccbb3bd6 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-01 01:16:35 +00:00
Peter Boyle
5eeabaa2bb HIP fix 2020-11-01 01:16:01 +00:00
Peter Boyle
00d0d6d008 Hip Free managed 2020-10-31 18:14:31 -04:00
Peter Boyle
537a9f7030 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-31 18:13:30 -04:00
Peter Boyle
cc9c993f74 Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt 2020-10-31 18:12:47 -04:00
Peter Boyle
d10422ded8 Test project on group 2020-10-31 18:12:30 -04:00
Peter Boyle
f313565a3c HiP compile 2020-10-31 12:12:40 +00:00
Daniel Richtmann
b3881d2636 Thread inversion of clover term 2020-10-30 16:18:58 +01:00
61d5860b46 Merge pull request #318 from rrhodgson/feature/BaryonSpinMat
Added untraced baryon contraction code
2020-10-28 18:39:59 +00:00
52d17987dc BaryonUtils.h updated debug output 2020-10-23 11:41:08 +01:00
19d8bba97d BaryonUtils function naming change 2020-10-21 11:58:53 +01:00
463d72d322 Added untraced baryon contraction code 2020-10-19 16:13:28 +01:00
d060341168 add an error check for Parameters.StartingType 2020-10-16 21:39:17 +09:00
c772bcd514 Merge https://github.com/paboyle/Grid into develop 2020-10-16 20:30:32 +09:00
Peter Boyle
3362f8dfa0 happy compile 2020-10-14 22:59:41 -04:00
Peter Boyle
bf3c9857e0 Closure changes 2020-10-14 21:37:14 -04:00
Peter Boyle
a88b3ceca5 Closure cases 2020-10-14 21:33:51 -04:00
Peter Boyle
aa135412f5 toComplex, toReal 2020-10-13 22:25:01 -04:00
Peter Boyle
9945399e60 Reaality issues fix by drop from ET 2020-10-13 22:24:32 -04:00
Peter Boyle
5eeffa49e8 Reality forced included 2020-10-13 22:23:57 -04:00
Peter Boyle
3f06209720 Pretty print 2020-10-13 22:18:51 -04:00
Peter Boyle
12e239dd9f Merge branch 'release/dirac-ITT-2020' 2020-10-13 13:38:29 -04:00
Peter Boyle
af2301afbb Merge pull request #312 from i-kanamori/debug_512
add reordring of random number generators in IO
2020-10-13 11:42:12 -04:00
Peter Boyle
f98856a26f Merge pull request #314 from smangham/issue_readme_precision
Fix for deprecated configure options in documentation (issue #313)
2020-10-13 11:41:38 -04:00
Sam Mangham
d55cc5b380 Fixed typo on --enable-comm, removed all references to --enable-precision except for config options, where it is listed as deprecated. Removed travis test for single precision. 2020-10-12 12:33:13 +01:00
c2b688abc9 Benchmark_IO: reducing max local volume to 32^4 2020-10-10 16:52:56 +01:00
b0d61b9687 Benchmark_IO cleaner output 2020-10-09 21:46:45 +01:00
5f893bf9af Benchmark_IO procurement sizes 2020-10-09 21:31:59 +01:00
0e17bd6597 I/O benchmark cleanup 2020-10-09 20:29:57 +01:00
22caa158cc multi-pass I/O benchmark, with statistic and robustness summary 2020-10-09 20:29:40 +01:00
b24a504d7c hook to access last parallel I/O performance measurement 2020-10-09 20:28:54 +01:00
Peter Boyle
992ef6e9fc more runtime 2020-10-08 22:19:20 -04:00
Peter Boyle
f32a320bc3 Single prec benchmark in double prec compile 2020-10-08 19:52:08 -04:00
Peter Boyle
5f0fe029d2 Improve meemory benchmarks for GPU (avoid host mem ping pong) 2020-10-08 19:51:28 -04:00
6b1486e89b fixing number of colours defaulting to 4 in most cases 2020-10-08 16:31:24 +01:00
Peter Boyle
3f9c427a3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-07 13:12:57 -04:00
Peter Boyle
d201277652 Expose Nc as a compile time configure option.
Remove precision option
2020-10-07 13:07:00 -04:00
fdda7cf9cf Merge branch 'feature/benchmark-io-update' into develop 2020-10-07 15:57:53 +01:00
e22d30f715 Merge branch 'develop' into feature/benchmark-io-update 2020-10-07 15:56:39 +01:00
1ba25a0d8c more I/O benchmark code cleaning 2020-10-07 15:38:41 +01:00
9ba3647bdf script to convert I/O benchmark logs to CSV 2020-10-07 15:35:03 +01:00
5ee832f738 I/O benchmark code cleaning 2020-10-07 15:31:51 +01:00
467deee46f Merge branch 'debug_512' into develop 2020-10-07 15:18:44 +09:00
Peter Boyle
35a69a5133 SU4 x SU4 2020-10-06 21:48:35 -04:00
e9c5a271a8 fixing potential issues with log alignment and timer I/O 2020-10-06 17:58:16 +01:00
acac2d6938 standard C/C++ I/O in benchmark 2020-10-06 17:57:00 +01:00
97db2b8d20 add reordring of random number generator in IO 2020-10-06 17:25:59 +09:00
Christoph Lehner
80fd6ab407 Merge pull request #17 from paboyle/develop
sync upstream
2020-10-06 09:01:39 +02:00
Christoph Lehner
5534921bee Merge pull request #16 from DanielRichtmann/feature/gpt-coarsenedmatrix
Enable checkerboard operations for CoarsenedMatrix
2020-10-01 10:55:13 +02:00
Peter Boyle
ace9cd64bb dpcpp happy 2020-09-29 08:03:46 -07:00
Peter Boyle
a3e2aeb603 dpcpp options happiness 2020-09-29 06:50:10 -07:00
Peter Boyle
049dd25785 Revert accidental commit thanks michael 2020-09-23 04:13:50 -04:00
Peter Boyle
d43d372294 Merge pull request #311 from mmphys/bugfix/MPIasynch
Asynchronous calls removed - reflect this in Communicator_none.cc
2020-09-22 10:41:48 -04:00
Michael Marshall
b71a081cba Asynchronous calls removed - reflect this in Communicator_none.cc
(Opportunistic doc update - OpenMP support on Mac OS)
2020-09-21 09:33:23 +01:00
Peter Boyle
c48909590b MPI asynch call removal 2020-09-17 20:47:32 +01:00
Peter Boyle
446ef40570 HIP IPC 2020-09-17 20:31:46 +01:00
Peter Boyle
81441e98f4 HIP runs sensible 2020-09-16 03:35:03 +01:00
Peter Boyle
ecd3f890f5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-16 02:30:14 +01:00
Peter Boyle
1c881ce23c HIP does not like half2 visible members x and y so must define own Half2 2020-09-16 02:28:33 +01:00
Peter Boyle
dacbbdd051 Hip Happy Birthday 2020-09-16 00:37:02 +01:00
Peter Boyle
2859955a03 HIP requires "inline" 2020-09-16 00:36:13 +01:00
Peter Boyle
cc220abd1d inline for HIP 2020-09-16 00:35:38 +01:00
Peter Boyle
d1c0c0197e HipCC requires inline on definition 2020-09-16 00:35:06 +01:00
Peter Boyle
fd9424ef27 innlines required to make HIP happy 2020-09-16 00:34:32 +01:00
Peter Boyle
a5c35c4024 Make HIP / Vega happy 2020-09-16 00:33:53 +01:00
Peter Boyle
e03b64dc06 HIP default flaags to work on ROCM 2020-09-16 00:33:09 +01:00
Peter Boyle
4677c40195 HIP improvements 2020-09-16 00:32:27 +01:00
Peter Boyle
288c615782 Hip improvements 2020-09-16 00:31:50 +01:00
Peter Boyle
48e81cf6f8 Hip Pragmas 2020-09-16 00:31:03 +01:00
Christoph Lehner
5cffa05c7e remove slab allocator file 2020-09-13 14:06:25 -04:00
Christoph Lehner
d50a2164d7 remove slab allocator 2020-09-13 14:06:06 -04:00
Christoph Lehner
32ff766dbd fix evict scheme, slab alloc 2020-09-13 14:02:53 -04:00
Christoph Lehner
01652d8cfe SlabAllocator 2020-09-13 05:56:02 -04:00
Daniel Richtmann
4d2dc7ba03 Enable even-odd for CoarsenedMatrix 2020-09-11 20:32:02 +02:00
Christoph Lehner
51d1beb1f3 Merge pull request #15 from paboyle/develop
Sync with upstream
2020-09-07 14:20:33 +02:00
Peter Boyle
65b724bb5f 2 level hddcr 2020-09-03 21:46:43 -04:00
Peter Boyle
6dbd117aa5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-03 20:30:49 -04:00
Peter Boyle
198b29f618 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-03 20:29:54 -04:00
Peter Boyle
a8309638d4 UVM check in MPI calls 2020-09-03 20:29:26 -04:00
Peter Boyle
f98a4e880e Merge pull request #310 from kostrzewa/accelerator_vector_stream_op_no_backspace
do not use backspace in AcceleratorVector (Coordinate) output stream operator
2020-09-03 20:24:59 -04:00
Peter Boyle
8244caff25 Remove the asynchronous non-Stencil calls. 2020-09-03 18:52:55 -04:00
Peter Boyle
bcd7895362 Include cuda.h 2020-09-03 15:49:13 -04:00
Peter Boyle
85b1c5df39 A never hit case that is not 100% confident is asserted for safety 2020-09-03 15:48:16 -04:00
Peter Boyle
b4255140d6 Stale data member eliminated 2020-09-03 15:47:46 -04:00
Peter Boyle
0c3095e173 Comms buffers to device memory 2020-09-03 15:45:35 -04:00
Peter Boyle
d3ce60713d UVM, Device and Lattice/aligned allocators 2020-09-03 15:44:13 -04:00
Peter Boyle
eac1f08b7b Close expressions passed as an argument 2020-09-01 15:30:33 -04:00
Peter Boyle
1654c4f3c0 Closure improved 2020-09-01 15:29:45 -04:00
Peter Boyle
8807d998bc closure improved 2020-09-01 15:29:11 -04:00
Peter Boyle
5791021dcd Speed up Cshift more with coalesced 2020-09-01 15:28:15 -04:00
Peter Boyle
c273fb051c Peek poke laattice 2020-09-01 15:27:59 -04:00
Peter Boyle
c545530170 little worry large Nbasis doesnt compile GPU 2020-09-01 00:14:33 -04:00
Peter Boyle
d982a5b6d5 Fix coaarsened 2020-09-01 00:14:04 -04:00
Peter Boyle
15ca8637f3 No norms in HermOp 2020-09-01 00:13:32 -04:00
Peter Boyle
cbc995b74c Made better interface 2020-09-01 00:12:54 -04:00
Peter Boyle
8b74174d74 Eigen tensor serialisatiino happy undeer GPU. Regret agreeing to let us couple Eigen types to Grid IO 2020-09-01 00:03:26 -04:00
Peter Boyle
e21fef17df real and imag part not in ET 2020-08-31 23:56:26 -04:00
Peter Boyle
3d27708f07 Basic where test 2020-08-31 23:55:49 -04:00
Peter Boyle
b918744184 Prettificatoin 2020-08-31 23:54:46 -04:00
Peter Boyle
7d14a3c086 Where working 2020-08-31 23:53:46 -04:00
Peter Boyle
e14a84317d GPU math unary calls 2020-08-31 23:50:49 -04:00
Peter Boyle
6c31b99f1f I knew coupling Eigen Tensor to Grid serialisation was a bad iddea.
Now the complex is different on GPU creates probblems
2020-08-31 23:49:19 -04:00
Peter Boyle
9522dcd611 Remove dead commented ouot coode 2020-08-31 23:40:29 -04:00
Peter Boyle
ed469898dc coalesced ET expressions 2020-08-31 23:38:40 -04:00
Peter Boyle
1eee94a809 Sorting real/im in read coalesced GPU ET 2020-08-31 23:36:49 -04:00
Bartosz Kostrzewa
54523369a3 do not use backspace in Coordinate output stream operator 2020-08-31 19:39:36 +02:00
Peter Boyle
a98c91c2a5 Merge pull request #309 from kostrzewa/format_benchmark_wilson_sweep
Format benchmark wilson sweep
2020-08-31 12:43:46 -04:00
Bartosz Kostrzewa
a9b92867a8 use tabulator 2020-08-31 18:41:17 +02:00
Bartosz Kostrzewa
65920faeba correct formatting of Benchmark_wilson_sweep output 2020-08-31 18:39:27 +02:00
Christoph Lehner
249e2db87d Merge pull request #14 from DanielRichtmann/feature/gpt-coarsenedmatrix
Expose more functions in CMat
2020-08-27 15:18:56 +02:00
Daniel Richtmann
cf3535d16e Expose more functions in CMat 2020-08-27 14:06:48 +02:00
Christoph Lehner
d61ee817f4 Merge pull request #13 from DanielRichtmann/feature/gpt-coarsenedmatrix
Changes needed for GPT MG
2020-08-27 12:11:06 +02:00
Peter Boyle
3448b7387c Almost there to coalesced ET 2020-08-26 17:04:49 -04:00
Peter Boyle
47b89d2739 Pragma protection improvementt 2020-08-26 17:04:27 -04:00
Christoph Lehner
2a75516330 state MPI/SLURM message only on world_rank zero 2020-08-26 12:34:17 -04:00
Daniel Richtmann
b2087f14c4 Fix CoarsenedMatrix regarding illegal memory accesses
Need a reference to geom since the lambda copies the this pointer which points to host memory, see
- https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture
- https://devblogs.nvidia.com/new-compiler-features-cuda-8/
2020-08-24 17:46:47 +02:00
Daniel Richtmann
dd1ba266b2 Fix mapping between dir + disp and point in CMat 2020-08-24 17:46:46 +02:00
Daniel Richtmann
1292d59563 Add a typedef + broaden interface of CMat 2020-08-24 17:46:45 +02:00
Christoph Lehner
9877ed9bf8 Merge pull request #12 from paboyle/develop
Sync
2020-08-22 16:35:35 +02:00
Christoph Lehner
f0dc0f3621 fix compile issue on Qpace3 2020-08-22 13:57:33 +02:00
Peter Boyle
1efe30d6cc SLurm stop nodes using same GPU 2020-08-21 02:02:53 +02:00
Peter Boyle
0b787e9fe0 Avoid namespaec collision to make gcc happy 2020-08-20 22:23:29 +02:00
Peter Boyle
37ec4b241c Default thread count sensible 2020-08-20 22:12:31 +02:00
Christoph Lehner
63b0a19f37 Merge pull request #11 from paboyle/develop
Sync
2020-08-20 20:53:39 +02:00
Peter Boyle
90ea7dfa99 Accelerator loops for device resident comms buf 2020-08-19 22:40:44 +02:00
Peter Boyle
f866d7c33e Merge pull request #307 from lehner/feature/gpt
Merged Nils's A64FX and minor fixes (MemoryManager::InitMessage, Tensor_index zeroit, ...)
2020-08-18 23:27:21 -04:00
Christoph Lehner
542bdef198 cleanup comments 2020-08-14 18:39:44 +02:00
Christoph Lehner
06007db3d9 true shm_none implementation with GPUs that disables the use of device shared memory for the stencils 2020-08-14 18:37:00 +02:00
Christoph Lehner
12e6059a70 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2020-08-13 16:16:52 +02:00
Christoph Lehner
dbaa24ebf6 further GPU memory access fixes (with this GPT passes all single-rank tests on non-summit GPUs) 2020-08-13 16:14:15 +02:00
Peter Boyle
3276aa67dc Update 2020-08-12 14:15:53 -04:00
Christoph Lehner
3b30b9f0c0 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2020-08-06 16:59:17 +02:00
Christoph Lehner
69db4816f7 fix variable capture in Scatter_plane_merge on accelerators 2020-08-06 16:57:16 +02:00
Christoph Lehner
3abe09025a when using SHM_NONE allow multiple ranks per node but without using shared memory 2020-08-06 14:42:38 +02:00
Christoph Lehner
e33878e0de Trigger re-run of CI 2020-08-06 11:50:24 +02:00
Christoph Lehner
27b4fbf3f0 assert for forbidden code path and fix check for faster CPU codepath in basisRotate 2020-08-03 07:57:33 -04:00
Christoph Lehner
968a90633a Zero -> zeroit in Tensor_index 2020-07-31 02:07:17 -04:00
Christoph Lehner
6365a89ba3 create separate InitMessage for MemoryManager that can be called after communicator setup 2020-07-30 07:25:05 -04:00
Christoph Lehner
ddbb008694 Merge pull request #10 from lehner/feature/gpt-sycl
Feature/gpt sycl
2020-07-30 13:12:09 +02:00
Christoph Lehner
7997e0a449 Merge branch 'feature/gpt' into feature/gpt-sycl 2020-07-30 13:11:31 +02:00
Christoph Lehner
197612bc7a fast cpu basisRotate and other small cleanups 2020-07-30 07:08:54 -04:00
Christoph Lehner
0e88bf4bff remove Nils's default pragma 2020-07-29 10:24:35 -04:00
Christoph Lehner
3e64d78469 include versions.h again and add back asserts in Test_simd 2020-07-29 10:18:05 -04:00
Christoph Lehner
2004611def Merge pull request #9 from nmeyer-ur/feature/a64fx-2
Feature/a64fx 2
2020-07-29 14:54:20 +02:00
Christoph Lehner
a2868c96a4 Merge pull request #8 from paboyle/develop
Doc recompile
2020-07-29 14:10:07 +02:00
Peter Boyle
7cf7f11e1a Doc recompile 2020-07-22 14:44:11 -04:00
nmeyer-ur
ea7f8fda5e fix typo 2020-07-22 09:34:05 +02:00
nmeyer-ur
906b78811b exit in Init when using --comms-overlap 2020-07-22 08:57:01 +02:00
Christoph Lehner
97703b181b Merge pull request #7 from paboyle/develop
Merge current develop
2020-07-12 16:24:53 +02:00
nmeyer-ur
d9474c6cb6 compiler-independent build using --enable-simd=A64FX 2020-07-09 10:07:02 +02:00
nmeyer-ur
bbd145382b enable --enable-simd=A64FX in configure 2020-07-08 12:43:51 +02:00
nmeyer-ur
1b08cb7300 Merge branch 'develop' into feature/a64fx-2 2020-07-08 08:18:18 +02:00
nmeyer-ur
337d9dc043 move barrier in Benchmark_wilson 2020-07-08 08:13:40 +02:00
nmeyer-ur
8726e94ea7 merge upstream develop 2020-07-07 20:26:47 +02:00
nmeyer-ur
67db4993c2 reset head, update SVE readme 2020-07-07 19:54:52 +02:00
f1f655d92b Merge pull request #304 from Heinrich-BR/develop
ScalarImpl.h updates
2020-07-06 10:16:03 +01:00
43334e88c3 Tiny change in a comment for clarity 2020-07-04 16:11:16 +01:00
4f1e66b044 Fixed HMC SU(N) integrator which was causing fields to leave Lie Algebra manifold for N>2 2020-07-04 03:53:06 +01:00
nmeyer-ur
fd3c8b0e85 correct build instructions qp4 2020-07-01 09:00:38 +02:00
nmeyer-ur
1635c263ee disable TOFU by default 2020-06-30 19:27:08 +02:00
64fe5b21b4 Merge pull request #298 from rrhodgson/feature/baryon
Update baryon 2pt and add 3pt function
2020-06-29 18:45:00 +01:00
Peter Boyle
ee9889821d Runs through to coarse space solve 2020-06-29 12:59:52 -04:00
eb470aa6dc Update to baryon and added comments/fix whitespace 2020-06-29 09:43:01 +01:00
77af9a3ddc Baryon revert sign 2020-06-26 10:08:42 +01:00
102089798c BaryonUtils: update to autoView 2020-06-25 16:41:58 +01:00
39cea8b5a7 Merge branch 'develop' into feature/baryon 2020-06-25 16:24:07 +01:00
a65f66d2db Merge branch 'feature/baryon3pt' into feature/baryon 2020-06-25 16:20:59 +01:00
Peter Boyle
936c5ecf69 Reduction GPU no compile fix 2020-06-24 17:28:31 -04:00
Peter Boyle
22cfbdbbb3 Boost precision in inner products in single 2020-06-24 12:52:31 -04:00
Peter Boyle
093d1ee21b Force initial values 2020-06-24 08:54:49 -04:00
Peter Boyle
d6ba2581ce Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-06-24 08:25:08 -04:00
Peter Boyle
577c064184 Memory manager initialise earlier 2020-06-24 08:24:38 -04:00
Peter Boyle
2ff1fa6fad UVM used shared for CPU alloccations andd ddont migrate 2020-06-23 22:14:56 -04:00
Peter Boyle
70be1bd8be Adding code under development 2020-06-23 10:24:21 -04:00
4ef50ba31f Baryon speedup 2020-06-23 11:44:20 +01:00
3e97a26f90 BaryonGamm3pt threads -> accelerator 2020-06-23 11:35:32 +01:00
599f28f6ef Baryon bug fixes 2020-06-23 11:10:26 +01:00
Peter Boyle
c48da35921 Memory Vector UVM and Lattice alignedAllocator separate 2020-06-22 20:21:53 -04:00
Peter Boyle
6c5fa8dcd8 Aligned allocate on CPU put through this interface 2020-06-20 14:34:29 -04:00
Peter Boyle
0d2f913a1a String.h for linux 2020-06-20 09:37:31 -04:00
Christoph Lehner
5b117865b2 Merge pull request #6 from paboyle/sycl
Sycl
2020-06-20 09:44:44 +02:00
Peter Boyle
1a74816c25 Hopeefully fixed 2020-06-19 17:50:52 -04:00
Peter Boyle
73de335256 Merge branch 'develop' into sycl 2020-06-19 17:44:16 -04:00
Peter Boyle
228fd450ce Typo fix (excusee - my keyboard is starting to break) 2020-06-19 17:36:05 -04:00
Peter Boyle
b949cf6b12 PeekLocal needs a view to keep thread safe.
ALLOCATION_CACHEE reenable
2020-06-19 17:13:27 -04:00
Peter Boyle
11bc1aeadc TThread count defaultt to fastest 2020-06-19 14:30:35 -04:00
Peter Boyle
66005929af Set up the cache size on all ranks 2020-06-19 12:50:54 -04:00
Christoph Lehner
05bbc49a99 Edge case in GetShmDim check 2020-06-19 12:01:23 -04:00
Peter Boyle
ff7c847735 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-19 01:22:16 -04:00
Peter Boyle
1aa988b2af Comms overlap fix UVM case 2020-06-19 01:21:14 -04:00
Peter Boyle
edf17708a8 Range improvement 2020-06-18 22:41:06 -04:00
Christoph Lehner
81a8209749 ConvertType for blockInnerProduct 2020-06-18 11:53:21 -04:00
nmeyer-ur
a87e45ba25 SVE readme update 2020-06-18 11:23:08 +02:00
nmeyer-ur
465856331a switch back to serialized; wrong results on single too 2020-06-15 15:39:39 +02:00
nmeyer-ur
cc958aa9ed switch back to standard MPI_init due to wrong results in Benchmark_wilson using comms-overlap 2020-06-15 14:21:38 +02:00
Peter Boyle
f46f029dbb Merge pull request #292 from lehner/feature/gpt-sycl
Catch edge case in SharedMemoryMPI::GetShmDims; Change default units …
2020-06-14 13:43:27 -04:00
Christoph Lehner
3dccd7aa2c Catch edge case in SharedMemoryMPI::GetShmDims; Change default units to consistent MB in init args; Want last element not past last element in MemoryManagerCache.cc 2020-06-14 13:26:01 -04:00
nmeyer-ur
a25e4b3d0c pred 32/64 for float/double instead of 8 in VLA patch 2020-06-13 14:44:37 +02:00
nmeyer-ur
d1210ca12a switch to double/float instead of float64_t/float32_t in VLA patch 2020-06-13 13:59:32 +02:00
nmeyer-ur
36ea0e222a type traits for ComplexF/D in VLA patch; cosmetics in VLS intrinsics 2020-06-13 13:42:35 +02:00
Peter Boyle
65e6e7da6f Merge pull request #291 from lehner/feature/gpt-sycl
Feature/gpt sycl
2020-06-12 20:42:32 -04:00
Christoph Lehner
b5e87e8d97 summit compile fixes 2020-06-12 18:16:12 -04:00
Christoph Lehner
5f5807d60a cleanup 2020-06-12 14:48:23 -04:00
nmeyer-ur
92281ec22d add 3 op Mult for VLA 2020-06-12 18:49:05 +02:00
nmeyer-ur
87266ce099 comment out fcmla in vector types: need also MultAddReal 2020-06-12 18:37:19 +02:00
nmeyer-ur
2a23f133e8 reenable fcmla for VLA 2020-06-12 17:30:38 +02:00
nmeyer-ur
8dbf790f62 correct tbl2 for sp 2020-06-12 17:12:34 +02:00
nmeyer-ur
2402b4940e vec_imm in float 2020-06-12 15:17:38 +02:00
nmeyer-ur
2111052fbe apply VLA patch for memcpy reduction suggested by Arm, CAS-162542-D6W7Z7 2020-06-12 14:49:19 +02:00
Christoph Lehner
7974acff54 merged sycl to feature-gpt 2020-06-12 06:49:38 -04:00
f0d17d2b49 Added Baryon3pt code 2020-06-12 11:35:52 +01:00
244c003a1b Updated Baryon code 2020-06-12 11:00:25 +01:00
0174f5f742 look for librt when using shm=shmopen 2020-06-11 16:50:43 +01:00
Peter Boyle
32b2b59be4 Offload 2020-06-10 20:36:26 -04:00
Peter Boyle
86bb0cc24b Keep on GPU 2020-06-10 20:00:00 -04:00
Peter Boyle
84c19587e7 Offload 2020-06-10 19:59:31 -04:00
Peter Boyle
237ce92540 Offload loops 2020-06-10 19:59:11 -04:00
Peter Boyle
a7ffc61e82 acceleratorSIMTlane() 2020-06-10 19:58:33 -04:00
Peter Boyle
fd97f64612 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-10 12:58:13 -04:00
Peter Boyle
8720aecb80 Offload more loops 2020-06-10 12:57:55 -04:00
Peter Boyle
cdf0a04fc5 Merge branch 'develop' into sycl 2020-06-09 04:00:12 -04:00
Peter Boyle
616d3dd737 CCommpile updates 2020-06-08 18:57:41 -04:00
Peter Boyle
8b066baca8 Implement transient mechanism 2020-06-08 18:28:53 -04:00
Peter Boyle
e97f3688db Fix the HMC issue - kernel was launchnig asynchronously 2020-06-08 17:01:15 -04:00
nmeyer-ur
433766ac62 revert Add/SubTimesI and prefetching in stencil
This reverts commit 9b2699226c.
2020-06-08 12:02:53 +02:00
nmeyer-ur
93a37c8f68 test prefetch to L2 in stencil 2020-06-08 09:39:50 +02:00
Peter Boyle
89a1e78390 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 23:20:37 -04:00
Peter Boyle
ffbb3fc02c Merge pull request #287 from felixerben/baryon-cleaner
slightly cleaner baryon 2pt code
2020-06-05 22:54:52 -04:00
Peter Boyle
5a73ef3647 Minor tweak to compile 2020-06-05 21:50:15 -04:00
Peter Boyle
87e5d2f4b7 Merge branch 'sycl' of https://www.github.com/paboyle/Grid into sycl 2020-06-05 17:32:21 -07:00
Peter Boyle
d720f10758 Liink error fix 2020-06-05 17:29:20 -07:00
Peter Boyle
14fcd0912a Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 19:14:17 -04:00
Peter Boyle
3111c0bd4f Single precisiono hardwire 2020-06-05 19:13:27 -04:00
Peter Boyle
e03064490e Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 18:53:39 -04:00
Peter Boyle
1a4c8c3387 Global edit with change to View usage. autoView() creates a wrapper object that closes the view when scope closes. 2020-06-05 18:52:35 -04:00
Peter Boyle
2b1e259441 Decode of SYCL devices fix 2020-06-04 17:16:55 -07:00
Peter Boyle
f39c2a240b Priintinig and device memory size detection 2020-06-04 14:58:03 -04:00
Peter Boyle
0d95805cde Print improvement 2020-06-03 22:50:32 -04:00
Peter Boyle
f67830587f Accelerator loop use 2020-06-03 22:50:09 -04:00
Peter Boyle
6bf7f839ff Better printing and logging 2020-06-03 09:28:57 -04:00
Peter Boyle
e3147881a9 Cache scheme 2020-06-03 09:23:48 -04:00
nmeyer-ur
9872c76825 introduce AddTimesI and SubTimesI; slight benefit in operators, but < 1%; breaks all other impls 2020-06-03 15:20:13 +02:00
Peter Boyle
fb559614ad Initialise meemory manager 2020-06-03 09:12:47 -04:00
Peter Boyle
e93e12b6a4 More verbose SYCL setup 2020-06-03 09:12:11 -04:00
Peter Boyle
0c3112cd94 Use view mechanism 2020-06-03 09:11:51 -04:00
Peter Boyle
8cfd5d2639 Need lattice view 2020-06-03 09:11:28 -04:00
Peter Boyle
1c9f20b15e Views must be closed 2020-06-03 09:10:29 -04:00
Peter Boyle
32237895bd Reorg memory manager for O(1) hash table 2020-06-03 09:09:52 -04:00
nmeyer-ur
5ee3ea2144 round-up after testing of prefetches in stencil close 2020-06-03 11:58:20 +02:00
Christoph Lehner
9fcb47ee63 Explicit error message instead of infinite loop in GlobalSharedMemory::GetShmDims 2020-06-02 07:44:38 -04:00
nmeyer-ur
5050833b42 revert changes due to performance penalty in Wilson using MPI 2020-06-02 13:08:57 +02:00
nmeyer-ur
7bee4ebb54 correct predication for svcadd 2020-06-02 10:51:39 +02:00
nmeyer-ur
71cf9851e7 correct type for vecd in TimesI and TimesMinusI 2020-06-02 10:44:15 +02:00
nmeyer-ur
b4735c9904 correct zero in svcadd 2020-06-02 10:38:05 +02:00
nmeyer-ur
9b2699226c use fcadd in TimesI and TimesMinusI instead of tbl and neg 2020-06-02 10:32:44 +02:00
nmeyer-ur
5f52804907 update calculation of data 2020-05-30 10:55:17 +02:00
nmeyer-ur
936071773e correct throughput in wilson and dwf 2020-05-29 22:15:59 +02:00
nmeyer-ur
1732f9319e more mods; counters seem to work correctly 2020-05-29 18:44:00 +02:00
nmeyer-ur
91c81cab30 some corrections; compiles on my laptop; untested 2020-05-29 18:19:22 +02:00
nmeyer-ur
38164f8480 include counters in WilsonFermionImplementation.h 2020-05-29 17:59:26 +02:00
nmeyer-ur
f013979791 add counter support in WilsonFermion.h 2020-05-29 17:13:59 +02:00
nmeyer-ur
e947b563ea add space in stencil output 2020-05-29 17:11:17 +02:00
nmeyer-ur
5cb3530c34 enable counters in Benchmark_wilson 2020-05-29 15:44:52 +02:00
nmeyer-ur
250008372f update SVE readme 2020-05-29 15:44:25 +02:00
Peter Boyle
1d252d0922 Accelerator inline 2020-05-28 11:45:25 -04:00
Peter Boyle
006cc8a8f1 Staggereed move to accelerator 2020-05-28 08:33:06 -04:00
nmeyer-ur
4fedd8d29f switch to MPI_THREAD_SERIALIZED instead of SINGLE 2020-05-27 14:08:34 +02:00
Peter Boyle
cf2938688a Sycl unhappy fix 2020-05-25 08:36:53 -07:00
Peter Boyle
ee63721bad int unhappiness sycl fix 2020-05-25 08:36:24 -07:00
Peter Boyle
22c5168d70 Sycl happier 2020-05-25 08:35:56 -07:00
Peter Boyle
949ac3cd24 Must avoid non-trivial copy constructors 2020-05-25 08:35:28 -07:00
Peter Boyle
7bc0166c1c SYCLL maknig happy - must avoid non ttrivial copy constructors 2020-05-25 08:34:19 -07:00
Peter Boyle
cb0d1b3399 hopefullly fix buildd fail 2020-05-24 21:27:00 -04:00
Peter Boyle
d1f1ccc705 HIP changes 2020-05-24 21:18:49 -04:00
Peter Boyle
c7519a237a Assertions fail on HIP foor unknown reasons - dedbugging 2020-05-24 14:02:47 -04:00
Peter Boyle
32be2b13d3 Updates for HiP 2020-05-24 14:00:55 -04:00
Peter Boyle
92b342a477 Hip reduction too 2020-05-24 13:50:28 -04:00
Peter Boyle
556da86ac3 HIP fp16 2020-05-24 13:41:58 -04:00
Peter Boyle
8285e41574 View location / access mode 2020-05-21 16:14:41 -04:00
Peter Boyle
f999408e92 View locatoin and access mode 2020-05-21 16:14:20 -04:00
Peter Boyle
a7abda89e2 View location & access mode 2020-05-21 16:13:59 -04:00
Peter Boyle
7860a50f70 Make view specify where and drive data motion - first cut.
This is a compile tiime option --enable-unified=yes/no
2020-05-21 16:13:16 -04:00
nmeyer-ur
6ddcef1bca fix build error enabling fcmla/mac in vector types for VLA 2020-05-21 21:21:03 +02:00
nmeyer-ur
8c5a5fdfce disable fcmla in vector type building for VLA 2020-05-21 19:41:42 +02:00
nmeyer-ur
046b1cbbc0 enable fcmla in tensor arithmetics; fixed-size works, VLA does not compile 2020-05-21 19:39:07 +02:00
nmeyer-ur
a65ce237c1 clean up; Exch1 VLA sp+dp integrate, tested, working 2020-05-21 09:48:06 +02:00
nmeyer-ur
cd27f1005d clean up; Exch1 sp integrate, tested, working 2020-05-21 08:45:43 +02:00
nmeyer-ur
f8c0a59221 clean up; Exch1 dp integrate, tested, working 2020-05-21 02:48:14 +02:00
nmeyer-ur
832485699f save some cycles in HtoD and DtoH by direct instead of multi-pass conversion 2020-05-20 23:04:35 +02:00
nmeyer-ur
81484a4760 symmetrize Mult and MultAddComplex 2020-05-20 22:36:45 +02:00
nmeyer-ur
9a86059761 symmetrize VLA and fixed size build messages 2020-05-20 20:05:42 +02:00
nmeyer-ur
b780b7b7a0 guard prevents multiple TOFU messages 2020-05-20 19:20:59 +02:00
nmeyer-ur
9e085bd04e guard prevents multiple A64FX build messages 2020-05-20 19:16:30 +02:00
ferben
6c6812a5ca GB/s output 2020-05-20 12:26:57 +01:00
Christoph Lehner
8358ee38c4 pull develop 2020-05-19 08:56:18 -04:00
ferben
1f154fe652 some cleanup in BaryonUtils 2020-05-19 13:48:56 +01:00
ferben
d708c0258d some cleanup in BaryonUtils 2020-05-19 13:48:00 +01:00
Christoph Lehner
a7635fd5ba summit mem 2020-05-18 17:52:26 -04:00
nmeyer-ur
6b6bf537d3 comment out mac in vector types 2020-05-18 20:36:16 +02:00
nmeyer-ur
323a651c71 correct typo 2020-05-18 19:58:27 +02:00
nmeyer-ur
9f212679f1 support fcmla in vector_types, untested 2020-05-18 19:55:18 +02:00
nmeyer-ur
032f7dde1a update SVE readme, asm generator 2020-05-18 19:10:36 +02:00
Peter Boyle
ebb60330c9 Automatic data motion options beginning 2020-05-17 16:34:25 -04:00
nmeyer-ur
50b1db1e8b implemented correct _m form (using 3 operands instead of 2) 2020-05-15 10:01:05 +02:00
nmeyer-ur
015d8bb38a introduced assertions in Benchmark_wilson, removed data output from Benchmark_dwf 2020-05-15 09:15:50 +02:00
nmeyer-ur
10a34312dc some fixed-size code clean up 2020-05-14 23:20:16 +02:00
nmeyer-ur
db8c0e7584 replaced _x form with _m form when using even/odd predication 2020-05-14 23:17:35 +02:00
Christoph Lehner
32fbdf4fb1 Merge pull request #5 from paboyle/develop
Sync upstream
2020-05-13 09:02:56 +02:00
Peter Boyle
a9847aa866 Dependence fix 2020-05-12 20:03:37 -04:00
nmeyer-ur
d15ccad8a7 switched to vec* in Reduce 2020-05-12 20:41:14 +02:00
nmeyer-ur
0009b5cee8 updated SVE_README 2020-05-12 19:02:33 +02:00
nmeyer-ur
20d1941a45 enabled asm kernels for fixed-size A64FXFIXEDSIZE 2020-05-12 19:01:12 +02:00
Peter Boyle
d24d8e8398 Use X-direction as more bits meaningful on CUDA.
2^31-1 shoulddd always bee enough for SIMD and thread reduced local volume

e.g. 32*2^31 = 2^36 = (2^9)^4 or 512^4 ias big enough.

Where 32 is gpu_threads * Nsimd = 8*4
2020-05-12 10:35:49 -04:00
Christoph Lehner
162e4bb567 no automatic prefetching for now 2020-05-12 07:01:23 -04:00
Peter Boyle
07c0c02f8c Speed up Cshift 2020-05-11 17:02:01 -04:00
Peter Boyle
8c31c065b5 Keep the Vector fixed to protect it from realloc 2020-05-11 17:00:30 -04:00
nmeyer-ur
b7c76ede29 Removed some assertions in Test_simd and removed exit() in Reduce 2020-05-11 22:43:00 +02:00
nmeyer-ur
05edf803bd corrected typo 2020-05-12 03:59:59 +09:00
Christoph Lehner
b1c86900b2 Merge pull request #4 from paboyle/develop
merge
2020-05-11 20:59:29 +02:00
nmeyer-ur
78b8e40f83 switched to gcc's internal data types 2020-05-11 18:11:23 +02:00
nmeyer-ur
fc2e9850d3 temporarily enable TOFU by default when using A64FX or A64FXFIXEDSIZE 2020-05-11 13:25:02 +02:00
nmeyer-ur
ffaaed679e MPI_THREAD_SINGLE hack for Fugaku, enabled by -DTOFU 2020-05-11 13:21:39 +02:00
Peter Boyle
bbbee5660d First compiile on HiP 2020-05-10 05:28:09 -04:00
nmeyer-ur
b2fd8b993a fixed-size clean up 2020-05-09 22:53:42 +02:00
nmeyer-ur
291ee8c3d0 updated fixed-size implementation; only Exch1 and prefetches missing 2020-05-09 22:18:02 +02:00
nmeyer-ur
e1a5b3ea49 unions for tables eliminate explicit loads, gcc does not complain 2020-05-09 21:21:57 +02:00
nmeyer-ur
55a55660cb reverted changes 2020-05-09 12:48:42 +02:00
Peter Boyle
52081acfa5 NVCC compile fixes 2020-05-08 13:14:12 -04:00
Peter Boyle
f8b8e00090 Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc
Aim to reduce the amount of cuda and other code variations floating around all over the place.

Will move GpuInit iinto Accelerator.cc from Init.cc
Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows
2020-05-08 06:23:55 -07:00
nmeyer-ur
ceb8b374da API change v3 2020-05-08 15:04:44 +02:00
nmeyer-ur
4bc2ad2894 API change v2 2020-05-08 15:00:25 +02:00
nmeyer-ur
798af3e68f retry changing StoD API 2020-05-08 14:34:59 +02:00
nmeyer-ur
b0ef2367f3 testing alternate call to PrecisionChange 2020-05-08 14:22:44 +02:00
nmeyer-ur
71a7350a85 changed 2nd argument in Reduce to native vector type 2020-05-08 12:26:51 +02:00
nmeyer-ur
6f79369955 trying to get rid of macro definition error 2020-05-08 12:19:24 +02:00
nmeyer-ur
f9cb6b979f corrected more typos 2020-05-08 12:11:01 +02:00
nmeyer-ur
ed4d9d17f8 corrected type 2020-05-08 12:09:22 +02:00
nmeyer-ur
fbed02690d some changes in breaking out A64FX: use -DA64FXFIXEDSIZE for fixed size, but also define GEN 2020-05-08 12:05:31 +02:00
nmeyer-ur
39f3ae5b1d corrected more types 2020-05-08 11:07:14 +02:00
nmeyer-ur
e64bec8c8e pulled SVE typedefs out of Optimization 2020-05-08 11:04:21 +02:00
nmeyer-ur
0893b4e552 fixed typos in PrecisionChange 2020-05-08 10:59:07 +02:00
nmeyer-ur
92f0f29670 fixed double overloading vecf in Div, corrected typos 2020-05-08 10:57:23 +02:00
nmeyer-ur
48a340a9d1 GEN seems to defined by default -> some fixes applied 2020-05-08 10:47:49 +02:00
nmeyer-ur
f45621109b placed typedefs in Optimization 2020-05-08 10:41:52 +02:00
nmeyer-ur
32d1a0bbea added even more debug output 2020-05-08 10:39:26 +02:00
nmeyer-ur
267cce66a1 added more debug output 2020-05-08 10:29:28 +02:00
nmeyer-ur
3417147b11 added real fma, corrected typos in tbls; integrated, must supply A64FXGCC with GEN in configure 2020-05-08 10:20:19 +02:00
nmeyer-ur
b338719bc8 first transition to fixed-size done, excl. Exch; next step: integration 2020-05-07 22:33:28 +02:00
nmeyer-ur
2b81cbe2c2 first attempt to introduce tables using fixed-size; still incomplete 2020-05-07 22:01:19 +02:00
nmeyer-ur
acff9d6ed2 transition to fixed size data types almost done; still incomplete 2020-05-07 21:24:07 +02:00
nmeyer-ur
a306a49788 first mods for fixed size; still incomplete 2020-05-07 19:07:49 +02:00
nmeyer-ur
7ef03c5368 updated SVE readme 2020-05-06 16:30:37 +02:00
Peter Boyle
28a1fcaaff First compile against SYCL 2020-05-05 11:13:27 -07:00
nmeyer-ur
5abec5b8a9 SVE_readme update, update Grid_vector_types.h 2020-04-25 13:48:26 +02:00
nmeyer-ur
499edc0636 updated SVE_README.txt; defined ARMCLANGCOMPAT macro 2020-04-25 13:41:24 +02:00
nmeyer-ur
d990e61be3 armclang 20.1 settings in SVE readme 2020-04-25 12:11:43 +02:00
nmeyer-ur
3edb2dc2da removed -static from gcc CXXFLAGS 2020-04-24 13:04:34 +02:00
nils meyer
345721220e resolved merge conflict 2020-04-24 10:14:21 +02:00
nils meyer
6db68d6ecb added SVE configure for armclang and gcc 2020-04-24 10:10:47 +02:00
nmeyer-ur
09f0963d1f changes in configure.ac ; to be verified 2020-04-23 11:27:03 +02:00
nils meyer
6f44e3c192 reverted changes in configure.ac ; included SVE configure readme 2020-04-23 11:18:50 +02:00
nils meyer
5893888f87 removed default no-strict-aliasing for gcc-10.0.1 exclusively 2020-04-22 19:29:55 +02:00
nmeyer-ur
39b448affb Merge remote-tracking branch 'origin/develop' into feature/a64fx-2 2020-04-22 17:34:12 +02:00
nils meyer
e54a8f05a9 Exchange1 with generic version for now, should use svtbl2 in final version 2020-04-20 22:45:27 +02:00
nils meyer
64b72fc17f testing gcc 10.0.1: build errors in Exchange1 using -DA64FX and in Lattice_base.h building Dslash only 2020-04-19 01:25:40 +02:00
nils meyer
6fdce60492 revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0) 2020-04-16 22:43:32 +02:00
nils meyer
852db4626a re-introduced HOTFIX cause Grid binaries give wrong results otherwise; checked in good gridverter.py 2020-04-15 18:22:19 +02:00
nils meyer
6504a098cc 999 GiB/s Wilson; 694 GiB/s DW (DP) 2020-04-15 15:06:52 +02:00
nils meyer
79a385faca disabled armclang hotfix cause armclang 20.0 performance gets a little 2020-04-15 11:46:55 +02:00
nils meyer
c12a67030a 980 GiB/s Wilson; 680 GiB/s DW (DP) 2020-04-15 10:55:06 +02:00
nils meyer
581392f2f2 now with pf, best results so far using intrinsics+pf 2020-04-12 22:06:14 +02:00
nils meyer
113f277b6a enable dslash asm using -DA64FXASM, additionaly -DDSLASHINTRIN for intrinsics impl 2020-04-11 04:55:01 +02:00
nils meyer
974586bedc Dslash finally works; cleaned up; uses MOVPRFX in assembly 2020-04-10 22:26:40 +02:00
nmeyer-ur
160f78c1e4 changed debug output to variable direct 3 2020-04-10 12:23:07 +02:00
nmeyer-ur
7e4e1bbbc2 changed debug output to variable direct 2 2020-04-10 12:22:04 +02:00
nmeyer-ur
e699b7e9f9 changed debug output to variable direct 2020-04-10 12:18:30 +02:00
nmeyer-ur
a28bc0de90 debug register address test in WilsonHand 2020-04-10 12:07:45 +02:00
nmeyer-ur
14d0fe4d6c added predication in WilsonHand 2020-04-10 12:04:00 +02:00
nmeyer-ur
0ad2e0815c debug output in WilsonHand 2020-04-10 11:56:29 +02:00
nils meyer
1c8ca05e16 Merge branch 'feature/a64fx-2' of https://github.com/nmeyer-ur/Grid into feature/a64fx-2 2020-04-09 23:32:19 +02:00
nils meyer
dc9c8340bb switched to DSLASHINTRIN for A64FX Dslash intrinsics 2020-04-09 23:30:23 +02:00
nils meyer
19eef97503 specialized A64FX Dslash kernels 2020-04-09 23:25:25 +02:00
nmeyer-ur
635246ce50 corrected typo 2020-04-09 21:42:50 +02:00
nils meyer
5cdbb7e71e fixed A64FX Dslash; compiles, but does not specialize -> assertion 2020-04-09 21:23:39 +02:00
nmeyer-ur
8123590a1b changes 2020-04-09 16:45:47 +02:00
nmeyer-ur
86c9c4da8b changes 2020-04-09 16:40:06 +02:00
nmeyer-ur
cd1efee866 changes 2020-04-09 16:35:13 +02:00
nmeyer-ur
bd310932f7 changes 2020-04-09 16:32:31 +02:00
nmeyer-ur
304762e7ac changes 2020-04-09 16:26:01 +02:00
nmeyer-ur
d79ab03a6c changes 2020-04-09 16:19:25 +02:00
nmeyer-ur
d5708e0eb2 more changes 2020-04-09 15:43:34 +02:00
nmeyer-ur
123f6b7a61 more changes 2020-04-09 15:17:19 +02:00
nmeyer-ur
2b6457dd9a added xp/xm recon accum 2020-04-09 15:13:19 +02:00
nmeyer-ur
b367cbd422 defined ADD_RESULT 2020-04-09 15:08:45 +02:00
nmeyer-ur
e252c1aca3 addressing 2020-04-09 15:03:12 +02:00
nmeyer-ur
b140c6a4f9 addressing 2020-04-09 15:01:15 +02:00
nmeyer-ur
326de36467 revised sU addressing scheme 2020-04-09 14:44:25 +02:00
nmeyer-ur
9f224a1647 fixed typo in single 2020-04-09 14:30:21 +02:00
nmeyer-ur
bb46ba9b5f fixed array size in single 2020-04-09 14:28:45 +02:00
nmeyer-ur
dd5a22b36b revised declarations 2020-04-09 14:21:27 +02:00
nmeyer-ur
1ea85b9972 Disabled build message 2020-04-09 13:47:21 +02:00
nmeyer-ur
8fb63f1c25 added A64FX Wilson kernels single precision 2020-04-09 13:41:04 +02:00
nmeyer-ur
77fa586f6c introduced A64FX Wilson kernels 2020-04-09 13:30:06 +02:00
nmeyer-ur
15238e8d5e reduce acle works, clean up 2020-04-03 20:40:44 +02:00
nmeyer-ur
b27e31957a reduce acle revised 2020-04-03 19:46:15 +02:00
nmeyer-ur
46927771e3 reduce acle still needs overhaul 2020-04-03 19:30:48 +02:00
nmeyer-ur
d8cea77707 define simd width in header 2020-04-03 19:22:25 +02:00
nmeyer-ur
5f8a76d490 clean up, reduction in acle 2020-04-03 19:18:24 +02:00
nmeyer-ur
28d49a3b60 build problem resolved 2020-04-03 16:52:48 +02:00
nmeyer-ur
b4c624ece6 added A64FX support 2020-04-03 15:43:23 +02:00
2c22db841a Added momentum scaling to scalar HMC theories in order to follow UKQCD/CPS conventions 2020-04-02 17:38:47 +01:00
374 changed files with 24610 additions and 5408 deletions

View File

@@ -1,61 +0,0 @@
language: cpp
cache:
directories:
- clang
matrix:
include:
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
before_install:
- export GRIDDIR=`pwd`
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which autoconf
- autoconf --version
- which automake
- automake --version
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
script:
- ./bootstrap.sh
- mkdir build
- cd build
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check

View File

@@ -37,7 +37,9 @@ directory
#endif #endif
//disables and intel compiler specific warning (in json.hpp) //disables and intel compiler specific warning (in json.hpp)
#ifdef __ICC
#pragma warning disable 488 #pragma warning disable 488
#endif
#ifdef __NVCC__ #ifdef __NVCC__
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp

View File

@@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h> #include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h> #include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/threads/Threads.h> #include <Grid/threads/ThreadReduction.h>
#include <Grid/serialisation/Serialisation.h> #include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h> #include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h> #include <Grid/communicator/Communicator.h>

View File

@@ -6,6 +6,7 @@
/////////////////// ///////////////////
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <memory>
#include <vector> #include <vector>
#include <array> #include <array>
#include <string> #include <string>
@@ -27,4 +28,7 @@
/////////////////// ///////////////////
#include "Config.h" #include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */ #endif /* GRID_STD_H */

View File

@@ -18,21 +18,28 @@
#pragma push_macro("__CUDA_ARCH__") #pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__") #pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__") #pragma push_macro("__CUDACC__")
#undef __CUDA_ARCH__
#undef __NVCC__ #undef __NVCC__
#undef __CUDACC__ #undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__ #define __NVCC__REDEFINE__
#endif #endif
/* SYCL save and restore compile environment*/ /* SYCL save and restore compile environment*/
#ifdef __SYCL_DEVICE_ONLY__ #ifdef GRID_SYCL
#pragma push #pragma push
#pragma push_macro("__SYCL_DEVICE_ONLY__") #pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__ #undef __SYCL_DEVICE_ONLY__
#undef EIGEN_USE_SYCL
#define EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif #endif
/* HIP save and restore compile environment*/
#ifdef GRID_HIP
#pragma push
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#endif
#define EIGEN_NO_HIP
#include <Grid/Eigen/Dense> #include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor> #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -51,6 +58,12 @@
#pragma pop #pragma pop
#endif #endif
/*HIP restore*/
#ifdef __HIP__REDEFINE__
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
#pragma pop
#endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif

View File

@@ -21,7 +21,8 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h extra_headers+=serialisation/Hdf5Type.h
endif endif
all: version-cache
all: version-cache Version.h
version-cache: version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\ @if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
@@ -42,7 +43,7 @@ version-cache:
fi;\ fi;\
rm -f vertmp rm -f vertmp
Version.h: Version.h: version-cache
cp version-cache Version.h cp version-cache Version.h
.PHONY: version-cache .PHONY: version-cache
@@ -53,6 +54,19 @@ Version.h:
include Make.inc include Make.inc
include Eigen.inc include Eigen.inc
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS
extra_sources+=$(ZWILS_FERMION_FILES)
endif
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources) CCFILES += $(extra_sources)

View File

@@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H #ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h> #include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h> #include <Grid/algorithms/Preconditioner.h>
NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/Zolotarev.h> #include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h> #include <Grid/algorithms/approx/Chebyshev.h>
@@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Forecast.h> #include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/approx/RemezGeneral.h> #include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h> #include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/iterative/Deflation.h> #include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h> #include <Grid/algorithms/iterative/BiCGSTAB.h>
NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateResidual.h> #include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h> #include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h> #include <Grid/algorithms/iterative/SchurRedBlack.h>
@@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h> #include <Grid/algorithms/iterative/PowerMethod.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/CoarsenedMatrix.h> #include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { }; template<class scalar> struct FFTW { };
@@ -191,7 +189,7 @@ public:
typedef typename sobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View(); autoView(pgbuf_v , pgbuf, CpuWrite);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -232,15 +230,18 @@ public:
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) { for(int p=0;p<processors[dim];p++) {
{
autoView(r_v,result,CpuRead);
autoView(p_v,pgbuf,CpuWrite);
thread_for(idx, sgrid->lSites(),{ thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd); Coordinate cbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf); sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf); peekLocalSite(s,r_v,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L; cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L; pokeLocalSite(s,p_v,cbuf);
pokeLocalSite(s,pgbuf,cbuf);
}); });
}
if (p != processors[dim] - 1) { if (p != processors[dim] - 1) {
result = Cshift(result,dim,L); result = Cshift(result,dim,L);
} }
@@ -269,15 +270,19 @@ public:
flops+= flops_call*NN; flops+= flops_call*NN;
// writing out result // writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
autoView(result_v,result,CpuWrite);
thread_for(idx,sgrid->lSites(),{ thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd); Coordinate clbuf(Nd), cgbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf); sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf; cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc; cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf); peekLocalSite(s,pgbuf_v,cgbuf);
pokeLocalSite(s,result,clbuf); pokeLocalSite(s,result_v,clbuf);
}); });
}
result = result*div; result = result*div;
// destroying plan // destroying plan

View File

@@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
LinearCombTimer.Start(); LinearCombTimer.Start();
bo = beta * omega; bo = beta * omega;
auto p_v = p.View(); {
auto r_v = r.View(); autoView( p_v , p, AcceleratorWrite);
auto v_v = v.View(); autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
}); });
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
@@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
alpha = rho / Calpha.real(); alpha = rho / Calpha.real();
LinearCombTimer.Start(); LinearCombTimer.Start();
auto h_v = h.View(); {
auto psi_v = psi.View(); autoView( p_v , p, AcceleratorRead);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
autoView( psi_v,psi, AcceleratorRead);
autoView( h_v , h, AcceleratorWrite);
autoView( s_v , s, AcceleratorWrite);
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
}); });
auto s_v = s.View();
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
}); });
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
@@ -166,11 +172,17 @@ class BiCGSTAB : public OperatorFunction<Field>
omega = Comega.real() / norm2(t); omega = Comega.real() / norm2(t);
LinearCombTimer.Start(); LinearCombTimer.Start();
auto t_v = t.View(); {
autoView( psi_v,psi, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
autoView( h_v , h, AcceleratorRead);
autoView( s_v , s, AcceleratorRead);
autoView( t_v , t, AcceleratorRead);
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
}); });
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
cp = norm2(r); cp = norm2(r);

View File

@@ -140,13 +140,15 @@ public:
b = cp / c; b = cp / c;
LinearCombTimer.Start(); LinearCombTimer.Start();
auto psi_v = psi.View(); {
auto p_v = p.View(); autoView( psi_v , psi, AcceleratorWrite);
auto r_v = r.View(); autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
}); });
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();

View File

@@ -0,0 +1,241 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_NON_HERM_H
#define GRID_PREC_GCR_NON_HERM_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
void operator() (const Field &src, Field &psi){
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src.Grid());
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(src,psi,rsq);
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.Op(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
return;
}
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b, zAz;
RealD zAAz;
ComplexD rq;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
LinalgTimer.Start();
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= innerProduct(q[peri_k],r); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,154 +0,0 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
#ifdef GRID_CUDA
int PointerCache::Ncache = 32;
#else
int PointerCache::Ncache = 8;
#endif
int PointerCache::Victim;
int PointerCache::VictimSmall;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
void PointerCache::Init(void)
{
char * str;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) Ncache = atoi(str);
if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) NcacheSmall = atoi(str);
if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
// printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
}
void *PointerCache::Insert(void *ptr,size_t bytes)
{
if (bytes < GRID_ALLOC_SMALL_LIMIT )
return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
return Insert(ptr,bytes,Entries,Ncache,Victim);
}
void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes)
{
if (bytes < GRID_ALLOC_SMALL_LIMIT )
return Lookup(bytes,EntriesSmall,NcacheSmall);
return Lookup(bytes,Entries,Ncache);
}
void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@@ -26,129 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H #pragma once
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
/* Could make these configurable, perhaps up to a max size*/
static const int NcacheSmallMax=128;
static const int NcacheMax=16;
static int NcacheSmall;
static int Ncache;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[NcacheMax];
static int Victim;
static PointerCacheEntry EntriesSmall[NcacheSmallMax];
static int VictimSmall;
public:
static void Init(void);
static void *Insert(void *ptr,size_t bytes) ;
static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes) ;
static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
};
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#ifdef GRID_NVCC
#define profilerCudaMeminfo \
{ size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl;}
#else
#define profilerCudaMeminfo
#endif
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
} \
profilerCudaMeminfo;
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp> template<typename _Tp>
class alignedAllocator { class alignedAllocator {
public: public:
@@ -172,89 +53,131 @@ public:
{ {
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr; return ptr;
} }
void deallocate(pointer __p, size_type __n) { void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes); profilerFree(bytes);
MemoryManager::CpuFree((void *)__p,bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
#endif
} }
// FIXME: hack for the copy constructor, eventually it must be avoided // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; void construct(pointer __p, const _Tp& __val) { assert(0);};
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////
// Unified virtual memory
//////////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class uvmAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
uvmAllocator() throw() { }
uvmAllocator(const uvmAllocator&) throw() { }
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
~uvmAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::SharedFree((void *)__p,bytes);
}
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Device memory
////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class devAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
devAllocator() throw() { }
devAllocator(const devAllocator&) throw() { }
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
~devAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::AcceleratorFree((void *)__p,bytes);
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>; #ifdef ACCELERATOR_CSHIFT
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; // Cshift on device
template<class T> using commVector = std::vector<T,alignedAllocator<T> >; template<class T> using cshiftAllocator = devAllocator<T>;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; #else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/MemoryManager.h>
#include <Grid/allocator/AlignedAllocator.h>

View File

@@ -0,0 +1,254 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
}
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes;
}
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
total_device-=bytes;
// PrintBytes();
}
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes;
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes();
}
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_shared-=bytes;
// PrintBytes();
}
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes;
}
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_host-=bytes;
}
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
total_host+=bytes;
}
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
total_host-=bytes;
}
}
#endif
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void MemoryManager::Init(void)
{
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
Ncache[Shared]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc;
}
}
}
void MemoryManager::InitMessage(void) {
#ifndef GRID_UVM
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
#endif
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif
#ifdef GRID_UVM
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,180 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryManager.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <list>
#include <unordered_map>
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum ViewAdvise {
AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
// AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
// AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
class MemoryManager {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void PrintBytes(void);
public:
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void *SharedAllocate(size_t bytes);
static void SharedFree (void *ptr,size_t bytes);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
static uint64_t DeviceBytes;
static uint64_t DeviceLRUBytes;
static uint64_t DeviceMaxBytes;
static uint64_t HostToDeviceBytes;
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
typedef std::list<uint64_t> LRU_t;
typedef typename LRU_t::iterator LRUiterator;
typedef struct {
int LRU_valid;
LRUiterator LRU_entry;
uint64_t CpuPtr;
uint64_t AccPtr;
size_t bytes;
uint32_t transient;
uint32_t state;
uint32_t accLock;
uint32_t cpuLock;
} AcceleratorViewEntry;
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
static AccViewTable_t AccViewTable;
static LRU_t LRU;
/////////////////////////////////////////////////
// Device motion
/////////////////////////////////////////////////
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
static void Evict(AcceleratorViewEntry &AccCache);
static void Flush(AcceleratorViewEntry &AccCache);
static void Clone(AcceleratorViewEntry &AccCache);
static void AccDiscard(AcceleratorViewEntry &AccCache);
static void CpuDiscard(AcceleratorViewEntry &AccCache);
// static void LRUupdate(AcceleratorViewEntry &AccCache);
static void LRUinsert(AcceleratorViewEntry &AccCache);
static void LRUremove(AcceleratorViewEntry &AccCache);
// manage entries in the table
static int EntryPresent(uint64_t CpuPtr);
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EntryErase (uint64_t CpuPtr);
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
static void AcceleratorViewClose(uint64_t AccPtr);
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
static void NotifyDeletion(void * CpuPtr);
public:
static void Print(void);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,478 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
MemoryManager::LRU_t MemoryManager::LRU;
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
/////////////////////////////////////////////////
// Mechanics of data table maintenance
/////////////////////////////////////////////////
int MemoryManager::EntryPresent(uint64_t CpuPtr)
{
if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
return count;
}
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
assert(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty;
AccCache.LRU_valid=0;
AccCache.transient=0;
AccCache.accLock=0;
AccCache.cpuLock=0;
AccViewTable[CpuPtr] = AccCache;
}
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
assert(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr);
assert(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator;
}
void MemoryManager::EntryErase(uint64_t CpuPtr)
{
auto AccCache = EntryLookup(CpuPtr);
AccViewTable.erase(CpuPtr);
}
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==0);
if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end();
} else {
LRU.push_front(AccCache.CpuPtr);
AccCache.LRU_entry = LRU.begin();
}
AccCache.LRU_valid = 1;
DeviceLRUBytes+=AccCache.bytes;
}
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes;
}
/////////////////////////////////////////////////
// Accelerator cache motion & consistency logic
/////////////////////////////////////////////////
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////
// Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool.
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==AccDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
}
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==CpuDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
AccCache.state=Consistent;
}
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state!=Empty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
AccCache.state=AccDirty;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
} else {
assert(0);
}
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
return NULL;
}
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
}
}
}
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,hint);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Cpu starts primary
if(mode==AcceleratorWriteDiscard){
CpuDiscard(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite){
Clone(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
assert(0);
}
// If view is opened on device remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
LRUremove(AccCache);
}
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
return AccCache.AccPtr;
}
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock==0);
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
LRUinsert(AccCache);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock>0);
assert(AccCache.accLock==0);
AccCache.cpuLock--;
}
/*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,transient);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes);
}
if(AccCache.state==Empty) {
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache.accLock= 0;
AccCache.cpuLock= 1;
} else if(AccCache.state==CpuDirty ){
// AccPtr dont care, deferred allocate
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++;
} else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++;
} else {
assert(0); // should be unreachable
}
AccCache.transient= transient? EvictNext : 0;
return AccCache.CpuPtr;
}
void MemoryManager::NotifyDeletion(void *_ptr)
{
// Look up in ViewCache
uint64_t ptr = (uint64_t)_ptr;
if(EntryPresent(ptr)) {
auto e = EntryLookup(ptr);
AccDiscard(e->second);
}
}
void MemoryManager::Print(void)
{
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
return AccCache.cpuLock+AccCache.accLock;
} else {
return 0;
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,23 @@
#include <Grid/GridCore.h>
#ifdef GRID_UVM
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::Print(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,95 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);

View File

@@ -81,6 +81,7 @@ public:
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic; int LocallyPeriodic;
Coordinate _checker_dim_mask;
public: public:

View File

@@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
@@ -104,6 +105,7 @@ public:
_ldimensions.resize(_ndimension); _ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
@@ -114,6 +116,8 @@ public:
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
_checker_dim_mask[d]=0;
_fdimensions[d] = dimensions[d]; // Global dimensions _fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d]; _simd_layout[d] = simd_layout[d];

View File

@@ -36,11 +36,27 @@ static const int CbBlack=1;
static const int Even =CbRed; static const int Even =CbRed;
static const int Odd =CbBlack; static const int Odd =CbBlack;
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk)
{
int nd=rdim.size();
Coordinate coor(nd);
Lexicographic::CoorFromIndex(coor,oindex,rdim);
int linear=0;
for(int d=0;d<nd;d++){
if(chk_dim_msk[d])
linear=linear+coor[d];
}
return (linear&0x1);
}
// Specialise this for red black grids storing half the data like a chess board. // Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase class GridRedBlackCartesian : public GridBase
{ {
public: public:
Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;

View File

@@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@@ -108,6 +107,8 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Reduction // Reduction
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void GlobalMax(RealD &);
void GlobalMax(RealF &);
void GlobalSum(RealF &); void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N); void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &); void GlobalSum(RealD &);
@@ -138,21 +139,6 @@ public:
int recv_from_rank, int recv_from_rank,
int bytes); int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit, double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,

View File

@@ -43,8 +43,16 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
#ifndef GRID_COMMS_THREADS
nCommThreads=1;
// wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
// other comms schemes are ok
MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
#else
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
#endif
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
assert(0); assert(0);
@@ -267,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalMax(float &f)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){ void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
@@ -294,60 +312,28 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes) int bytes)
{ {
std::vector<CommsRequest_t> reqs(0); std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0); unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0); unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { // Enforce no UVM in comms, device or host OK
MPI_Request xrq; assert(acceleratorIsCommunicable(xmit));
MPI_Request rrq; assert(acceleratorIsCommunicable(recv));
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally // Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from, recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
assert(ierr==0); assert(ierr==0);
}
}
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dest,
void *recv, void *recv,
@@ -382,16 +368,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor); assert(from != _processor);
assert(gme == ShmRank); assert(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag;
if ( gfrom ==MPI_UNDEFINED) { if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq); tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0); assert(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} }
if ( gdest == MPI_UNDEFINED ) { if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq); tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
@@ -403,15 +392,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
return off_node_bytes; return off_node_bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{ {
int nreq=list.size(); int nreq=list.size();
@@ -422,6 +403,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(ierr==0); assert(ierr==0);
list.resize(0); list.resize(0);
} }
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
//{
//}
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void)
{ {
int ierr = MPI_Barrier(communicator); int ierr = MPI_Barrier(communicator);
@@ -483,5 +471,3 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -67,6 +67,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
CartesianCommunicator::~CartesianCommunicator(){} CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalMax(float &){}
void CartesianCommunicator::GlobalMax(double &){}
void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){} void CartesianCommunicator::GlobalSum(double &){}
@@ -77,15 +79,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node // Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit, void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -96,20 +89,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{ {
assert(0); assert(0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
bcopy(in,out,bytes*words); bcopy(in,out,bytes*words);
@@ -137,10 +116,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int recv_from_rank, int recv_from_rank,
int bytes, int dir) int bytes, int dir)
{ {
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes; return 2.0*bytes;
} }
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -150,13 +125,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank, int recv_from_rank,
int bytes, int dir) int bytes, int dir)
{ {
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ {
SendToRecvFromComplete(waitall);
} }
void CartesianCommunicator::StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};

View File

@@ -102,7 +102,7 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes); static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };

View File

@@ -7,6 +7,7 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -29,9 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
#ifdef GRID_NVCC #ifdef GRID_CUDA
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#endif #endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: " #define header "SharedMemoryMpi: "
@@ -47,7 +51,12 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// Split into groups that can share memory // Split into groups that can share memory
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
#else
MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
#endif
MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize); MPI_Comm_size(WorldShmComm ,&WorldShmSize);
@@ -161,6 +170,23 @@ static inline int divides(int a,int b)
} }
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{ {
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now // Powers of 2,3,5 only in prime decomposition for now
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -170,17 +196,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
std::vector<int> primes({2,3,5}); std::vector<int> primes({2,3,5});
int dim = 0; int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1; int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) { while(AutoShmSize != WorldShmSize) {
for(int p=0;p<primes.size();p++) { int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p]; int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim]) if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) { && divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime; AutoShmSize*=prime;
ShmDims[dim]*=prime; ShmDims[dim]*=prime;
last_dim = dim;
break; break;
} }
} }
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension; dim=(dim+1) %ndimension;
} }
} }
@@ -413,7 +446,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC #if defined(GRID_CUDA) ||defined(GRID_HIP)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@@ -433,27 +466,19 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2); // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
#else
std::cout << "setting device to WorldShmRank"<<std::endl;
cudaSetDevice(WorldShmRank);
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
auto err = cudaMalloc(&ShmCommBuf, bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl; std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
if ( WorldRank == 0 ){ // if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; if ( 1 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
} }
SharedMemoryZero(ShmCommBuf,bytes); SharedMemoryZero(ShmCommBuf,bytes);
@@ -462,18 +487,30 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
for(int r=0;r<WorldShmSize;r++){ for(int r=0;r<WorldShmSize;r++){
#ifndef GRID_MPI3_SHM_NONE
////////////////////////////////////////////////// //////////////////////////////////////////////////
// If it is me, pass around the IPC access key // If it is me, pass around the IPC access key
////////////////////////////////////////////////// //////////////////////////////////////////////////
#ifdef GRID_CUDA
cudaIpcMemHandle_t handle; cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) { if ( r==WorldShmRank ) {
err = cudaIpcGetMemHandle(&handle,ShmCommBuf); auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) { if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif
#ifdef GRID_HIP
hipIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
////////////////////////////////////////////////// //////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm // Share this IPC handle across the Shm Comm
////////////////////////////////////////////////// //////////////////////////////////////////////////
@@ -490,17 +527,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// If I am not the source, overwrite thisBuf with remote buffer // If I am not the source, overwrite thisBuf with remote buffer
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf; void * thisBuf = ShmCommBuf;
#ifdef GRID_CUDA
if ( r!=WorldShmRank ) { if ( r!=WorldShmRank ) {
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess); auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) { if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif
#ifdef GRID_HIP
if ( r!=WorldShmRank ) {
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Save a copy of the device buffers // Save a copy of the device buffers
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
WorldShmCommBufs[r] = thisBuf; WorldShmCommBufs[r] = thisBuf;
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
} }
_ShmAllocBytes=bytes; _ShmAllocBytes=bytes;
@@ -633,7 +684,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif #endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) { if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap"); perror("failed mmap");
assert(0); assert(0);
@@ -677,15 +727,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{ {
#ifdef GRID_NVCC #ifdef GRID_CUDA
cudaMemset(dest,0,bytes); cudaMemset(dest,0,bytes);
#else #else
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes) void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{ {
#ifdef GRID_NVCC #ifdef GRID_CUDA
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault); cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else #else
bcopy(src,dest,bytes); bcopy(src,dest,bytes);
@@ -705,7 +755,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// Split into groups that can share memory // Split into groups that can share memory
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
#else
MPI_Comm_split(comm, rank, 0, &ShmComm);
#endif
MPI_Comm_rank(ShmComm ,&ShmRank); MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize); MPI_Comm_size(ShmComm ,&ShmSize);
ShmCommBufs.resize(ShmSize); ShmCommBufs.resize(ShmSize);
@@ -735,22 +789,15 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r; std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_IBM_SUMMIT #ifdef GRID_SHM_FORCE_MPI
// Hide the shared memory path between sockets // Hide the shared memory path between ranks
// if even number of nodes {
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
for(int r=0;r<size;r++){ for(int r=0;r<size;r++){
int hisRank=ShmRanks[r]; if ( r!=rank ) {
if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
ShmRanks[r] = MPI_UNDEFINED; ShmRanks[r] = MPI_UNDEFINED;
} }
} }
} }
}
#endif #endif
SharedMemoryTest(); SharedMemoryTest();

View File

@@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryNone: "
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap // Hugetlbfs mapping intended, use anonymous mmap
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#if 1
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@@ -83,7 +116,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes; _ShmAllocBytes=bytes;
_ShmAlloc=1; _ShmAlloc=1;
}; };
#endif
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality

View File

@@ -52,23 +52,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<typename Op, typename T1> template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift) auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
return Cshift(closure(expr),dim,shift);
}
template <class Op, class T1, class T2>
auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
{
return Cshift(closure(expr),dim,shift);
}
template <class Op, class T1, class T2, class T3>
auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, expr.arg2),
eval(0, expr.arg3)))>
{ {
return Cshift(closure(expr),dim,shift); return Cshift(closure(expr),dim,shift);
} }

View File

@@ -29,11 +29,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern Vector<std::pair<int,int> > Cshift_table;
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0; int ent = 0;
static Vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int bo = n*e2; int bo = n*e2;
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b); Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
} }
} }
} else { } else {
@@ -65,14 +67,26 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int o = n*stride; int o = n*stride;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b); Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
} }
} }
} }
} }
thread_for(i,ent,{ {
buffer[table[i].first]=rhs_v[table[i].second]; auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@@ -95,43 +109,80 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){ if ( cbmask ==0x3){
thread_for_collapse(2,n,e1,{ #ifdef ACCELERATOR_CSHIFT
for(int b=0;b<e2;b++){ autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
int o = n*n1; int o = n*n1;
int offset = b+n*e2; int offset = b+n*e2;
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}
}); });
} else { #else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
// Case of SIMD split AND checker dim cannot currently be hit, except in vobj temp =rhs_v[so+o+b];
// Test_cshift_red_black code. extract<vobj>(temp,pointers,offset);
std::cout << " Dense packed buffer WARNING " <<std::endl; });
thread_for_collapse(2,n,e1,{ #endif
for(int b=0;b<e2;b++){ } else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
Coordinate coor;
int o=n*n1; int o=n*n1;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2; int offset = b+n*e2;
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
} }
}); });
#endif
} }
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -145,7 +196,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent =0; int ent =0;
if ( cbmask ==0x3 ) { if ( cbmask ==0x3 ) {
@@ -154,7 +206,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs.Grid()->_slice_block[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b); Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
} }
} }
@@ -165,16 +217,27 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++); Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
} }
} }
} }
} }
auto rhs_v = rhs.View(); {
thread_for(i,ent,{ auto buffer_p = & buffer[0];
rhs_v[table[i].first]=buffer[table[i].second]; auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
}); });
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
@@ -194,21 +257,33 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
auto rhs_v = rhs.View(); int _slice_stride = rhs.Grid()->_slice_stride[dimension];
thread_for_collapse(2,n,e1,{ int _slice_block = rhs.Grid()->_slice_block[dimension];
for(int b=0;b<e2;b++){ #ifdef ACCELERATOR_CSHIFT
int o = n*rhs.Grid()->_slice_stride[dimension]; autoView( rhs_v , rhs, AcceleratorWrite);
int offset = b+n*rhs.Grid()->_slice_block[dimension]; accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}
}); });
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View(); assert(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
@@ -225,6 +300,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// local to node block strided copies // local to node block strided copies
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask) template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -239,14 +315,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0; int ent=0;
if(cbmask == 0x3 ){ if(cbmask == 0x3 ){
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride+b; int o =n*stride+b;
table[ent++] = std::pair<int,int>(lo+o,ro+o); Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
} }
} else { } else {
@@ -255,23 +333,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int o =n*stride+b; int o =n*stride+b;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o); Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
} }
} }
} }
auto rhs_v = rhs.View(); {
auto lhs_v = lhs.View(); auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{ thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second]; lhs_v[table[i].first]=rhs_v[table[i].second];
}); });
#endif
}
} }
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@@ -285,29 +372,41 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e2=rhs.Grid()->_slice_block [dimension]; int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0; int ent=0;
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} else { } else {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} }
auto rhs_v = rhs.View(); {
auto lhs_v = lhs.View(); auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{ thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
}); });
#endif
}
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////

View File

@@ -101,7 +101,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -121,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size); static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
commVector<vobj> recv_buf(buffer_size); static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -138,7 +139,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
} else { } else {
int words = send_buf.size(); int words = buffer_size;
if (cbmask != 0x3) words=words>>1; if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
@@ -150,12 +151,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier(); grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -195,8 +198,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
@@ -242,11 +252,204 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){ if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf_extract[i][0], (void *)recv_buf_extract_mpi,
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier(); grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
} else { } else {
@@ -258,7 +461,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
} }
} }
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -0,0 +1,4 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
Vector<std::pair<int,int> > Cshift_table;
NAMESPACE_END(Grid);

View File

@@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/lattice/Lattice_view.h>
#include <Grid/lattice/Lattice_base.h> #include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h> #include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h> #include <Grid/lattice/Lattice_ET.h>
@@ -35,7 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_local.h> #include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h> #include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h> #include <Grid/lattice/Lattice_peekpoke.h>
//#include <Grid/lattice/Lattice_reality.h> #include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
#include <Grid/lattice/Lattice_comparison_utils.h> #include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h> #include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h> #include <Grid/lattice/Lattice_coordinate.h>

View File

@@ -42,9 +42,24 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Predicated where support // Predicated where support
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
#ifdef GRID_SIMT
// drop to scalar in SIMT; cleaner in fact
template <class iobj, class vobj, class robj> template <class iobj, class vobj, class robj>
accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, accelerator_inline vobj predicatedWhere(const iobj &predicate,
const robj &iffalse) { const vobj &iftrue,
const robj &iffalse)
{
Integer mask = TensorRemove(predicate);
typename std::remove_const<vobj>::type ret= iffalse;
if (mask) ret=iftrue;
return ret;
}
#else
template <class iobj, class vobj, class robj>
accelerator_inline vobj predicatedWhere(const iobj &predicate,
const vobj &iftrue,
const robj &iffalse)
{
typename std::remove_const<vobj>::type ret; typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
@@ -68,6 +83,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
merge(ret, falsevals); merge(ret, falsevals);
return ret; return ret;
} }
#endif
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
//Specialization of getVectorType for lattices //Specialization of getVectorType for lattices
@@ -81,26 +97,62 @@ struct getVectorType<Lattice<T> >{
//-- recursive evaluation of expressions; -- //-- recursive evaluation of expressions; --
// handle leaves of syntax tree // handle leaves of syntax tree
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template<class sobj> accelerator_inline template<class sobj,
typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr>
accelerator_inline
sobj eval(const uint64_t ss, const sobj &arg) sobj eval(const uint64_t ss, const sobj &arg)
{ {
return arg; return arg;
} }
template <class lobj> accelerator_inline template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
{ {
return arg[ss]; return arg(ss);
}
////////////////////////////////////////////
//-- recursive evaluation of expressions; --
// whole vector return, used only for expression return type inference
///////////////////////////////////////////////////
template<class sobj> accelerator_inline
sobj vecEval(const uint64_t ss, const sobj &arg)
{
return arg;
} }
template <class lobj> accelerator_inline template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg)
{ {
auto view = arg.AcceleratorView(ViewRead); return arg[ss];
return view[ss];
} }
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand // handle nodes in syntax tree- eval one operand
// vecEval needed (but never called as all expressions offloaded) to infer the return type
// in SIMT contexts of closure.
///////////////////////////////////////////////////
template <typename Op, typename T1> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
-> decltype(expr.op.func( vecEval(ss, expr.arg1)))
{
return expr.op.func( vecEval(ss, expr.arg1) );
}
// vecEval two operands
template <typename Op, typename T1, typename T2> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
{
return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
}
// vecEval three operands
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
{
return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
}
///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand coalesced
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <typename Op, typename T1> accelerator_inline template <typename Op, typename T1> accelerator_inline
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr) auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
@@ -108,23 +160,41 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
{ {
return expr.op.func( eval(ss, expr.arg1) ); return expr.op.func( eval(ss, expr.arg1) );
} }
///////////////////////
// eval two operands // eval two operands
///////////////////////
template <typename Op, typename T1, typename T2> accelerator_inline template <typename Op, typename T1, typename T2> accelerator_inline
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr) auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2))) -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
{ {
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) ); return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
} }
///////////////////////
// eval three operands // eval three operands
///////////////////////
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3))) -> decltype(expr.op.func(eval(ss, expr.arg1),
eval(ss, expr.arg2),
eval(ss, expr.arg3)))
{ {
return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)); #ifdef GRID_SIMT
// Handles Nsimd (vInteger) != Nsimd(ComplexD)
typedef decltype(vecEval(ss, expr.arg2)) rvobj;
typedef typename std::remove_reference<rvobj>::type vobj;
const int Nsimd = vobj::vector_type::Nsimd();
auto vpred = vecEval(ss,expr.arg1);
ExtractBuffer<Integer> mask(Nsimd);
extract<vInteger, Integer>(TensorRemove(vpred), mask);
int s = acceleratorSIMTlane(Nsimd);
return expr.op.func(mask[s],
eval(ss, expr.arg2),
eval(ss, expr.arg3));
#else
return expr.op.func(eval(ss, expr.arg1),
eval(ss, expr.arg2),
eval(ss, expr.arg3));
#endif
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -180,16 +250,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
cb = lat.Checkerboard(); cb = lat.Checkerboard();
} }
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
{
}
template <typename Op, typename T1> inline template <typename Op, typename T1> inline
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr) void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
{ {
CBFromExpression(cb, expr.arg1); // recurse AST CBFromExpression(cb, expr.arg1); // recurse AST
} }
template <typename Op, typename T1, typename T2> inline template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr) void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{ {
@@ -204,32 +270,86 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
CBFromExpression(cb, expr.arg3); // recurse AST CBFromExpression(cb, expr.arg3); // recurse AST
} }
//////////////////////////////////////////////////////////////////////////
// ViewOpen
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
{
lat.ViewOpen(AcceleratorRead);
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // rrecurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // recurse AST
ExpressionViewOpen(expr.arg3); // recurse AST
}
//////////////////////////////////////////////////////////////////////////
// ViewClose
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
{
lat.ViewClose();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
ExpressionViewClose(expr.arg3); // recurse AST
}
//////////////////////////////////////////// ////////////////////////////////////////////
// Unary operators and funcs // Unary operators and funcs
//////////////////////////////////////////// ////////////////////////////////////////////
#define GridUnopClass(name, ret) \ #define GridUnopClass(name, ret) \
template <class arg> \
struct name { \ struct name { \
static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \ template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
}; };
GridUnopClass(UnarySub, -a); GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a)); GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryReal, real(a));
GridUnopClass(UnaryImag, imag(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a)); GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a)); GridUnopClass(UnaryAsin, asin(a));
@@ -241,10 +361,10 @@ GridUnopClass(UnaryExp, exp(a));
// Binary operators // Binary operators
//////////////////////////////////////////// ////////////////////////////////////////////
#define GridBinOpClass(name, combination) \ #define GridBinOpClass(name, combination) \
template <class left, class right> \
struct name { \ struct name { \
template <class _left, class _right> \
static auto accelerator_inline \ static auto accelerator_inline \
func(const left &lhs, const right &rhs) \ func(const _left &lhs, const _right &rhs) \
-> decltype(combination) const \ -> decltype(combination) const \
{ \ { \
return combination; \ return combination; \
@@ -264,10 +384,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
// Trinary conditional op // Trinary conditional op
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
#define GridTrinOpClass(name, combination) \ #define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \
struct name { \ struct name { \
template <class _predicate,class _left, class _right> \
static auto accelerator_inline \ static auto accelerator_inline \
func(const predicate &pred, const left &lhs, const right &rhs) \ func(const _predicate &pred, const _left &lhs, const _right &rhs) \
-> decltype(combination) const \ -> decltype(combination) const \
{ \ { \
return combination; \ return combination; \
@@ -275,17 +395,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
}; };
GridTrinOpClass(TrinaryWhere, GridTrinOpClass(TrinaryWhere,
(predicatedWhere<predicate, (predicatedWhere<
typename std::remove_reference<left>::type, typename std::remove_reference<_predicate>::type,
typename std::remove_reference<right>::type>(pred, lhs,rhs))); typename std::remove_reference<_left>::type,
typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
//////////////////////////////////////////// ////////////////////////////////////////////
// Operator syntactical glue // Operator syntactical glue
//////////////////////////////////////////// ////////////////////////////////////////////
#define GRID_UNOP(name) name
#define GRID_UNOP(name) name<decltype(eval(0, arg))> #define GRID_BINOP(name) name
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))> #define GRID_TRINOP(name) name
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \ #define GRID_DEF_UNOP(op, name) \
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \ template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
@@ -331,22 +451,17 @@ GridTrinOpClass(TrinaryWhere,
GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(adj, UnaryAdj); //GRID_DEF_UNOP(adj, UnaryAdj);
GRID_DEF_UNOP(conjugate, UnaryConj); //GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(real, UnaryReal);
GRID_DEF_UNOP(imag, UnaryImag);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI); GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing // abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt); GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin); GRID_DEF_UNOP(asin, UnaryAsin);
@@ -371,29 +486,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Op, class T1> template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr) auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
{ {
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr); Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2> template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
{ {
Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr); Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2, class T3> template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1), -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
eval(0, expr.arg2), vecEval(0, expr.arg2),
eval(0, expr.arg3)))> vecEval(0, expr.arg3)))>::type >
{ {
Lattice<decltype(expr.op.func(eval(0, expr.arg1), Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
eval(0, expr.arg2), vecEval(0, expr.arg2),
eval(0, expr.arg3)))> ret(expr); vecEval(0, expr.arg3)))>::type > ret(expr);
return ret; return ret;
} }
#define EXPRESSION_CLOSURE(function) \
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
auto function(Expression &expr) -> decltype(function(closure(expr))) \
{ \
return function(closure(expr)); \
}
#undef GRID_UNOP #undef GRID_UNOP
#undef GRID_BINOP #undef GRID_BINOP

View File

@@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -56,13 +56,13 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
auto tmp =ret_v(ss);
mac(&tmp,&lhs_t,&rhs_t); mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
}); });
@@ -73,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@@ -89,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@@ -108,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs); mult(&tmp,&lhs_v(ss),&rhs);
@@ -121,10 +121,10 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; auto tmp =ret_v(ss);
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs); mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@@ -135,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@@ -148,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@@ -165,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@@ -179,10 +179,10 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; auto tmp =ret_v(ss);
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t); mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@@ -193,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@@ -206,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@@ -221,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto x_v = x.AcceleratorView(ViewRead); autoView( x_v , x, AcceleratorRead);
auto y_v = y.AcceleratorView(ViewRead); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss); auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@@ -234,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto x_v = x.AcceleratorView(ViewRead); autoView( x_v , x, AcceleratorRead);
auto y_v = y.AcceleratorView(ViewRead); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);

View File

@@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#define STREAMING_STORES #define STREAMING_STORES
@@ -37,180 +38,6 @@ NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16]; extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum LatticeAcceleratorAdvise {
AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
ViewRead = 0x1,
ViewWrite = 0x2,
ViewReadWrite = 0x3
};
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
accelerator_inline void Advise(int advise) {
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
if (advise & AdviseInfrequentUse) {
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
}
if (advise & AdviseReadMostly) {
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
}
#endif
#endif
};
accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
int target;
cudaGetDevice(&target);
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
#endif
#endif
};
accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
#endif
#endif
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics. // The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code // This contains extra (host resident) grid pointer data that may be accessed by host code
@@ -253,28 +80,23 @@ private:
} }
} }
public: public:
/////////////////////////////////////////////////////////////////////////////////
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose();
}
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops. // Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device // The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas // in device lambdas
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
{ // and HostView for thread_for
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const LatticeView<vobj> View (ViewMode mode) const
{ {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this)); LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.AcceleratorPrefetch(mode);
return accessor;
}
LatticeView<vobj> HostView(int mode = ViewReadWrite) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
accessor.HostPrefetch(mode);
return accessor; return accessor;
} }
@@ -298,11 +120,15 @@ public:
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite); auto exprCopy = expr;
accelerator_for(ss,me.size(),1,{ ExpressionViewOpen(exprCopy);
auto tmp = eval(ss,expr); auto me = View(AcceleratorWriteDiscard);
vstream(me[ss],tmp); accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@@ -317,11 +143,15 @@ public:
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite); auto exprCopy = expr;
accelerator_for(ss,me.size(),1,{ ExpressionViewOpen(exprCopy);
auto tmp = eval(ss,expr); auto me = View(AcceleratorWriteDiscard);
vstream(me[ss],tmp); accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@@ -335,11 +165,15 @@ public:
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite); auto exprCopy = expr;
accelerator_for(ss,me.size(),1,{ ExpressionViewOpen(exprCopy);
auto tmp = eval(ss,expr); auto me = View(AcceleratorWriteDiscard);
vstream(me[ss],tmp); accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
//GridFromExpression is tricky to do //GridFromExpression is tricky to do
@@ -390,10 +224,11 @@ public:
} }
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View(); auto me = View(CpuWrite);
thread_for(ss,me.size(),{ thread_for(ss,me.size(),{
me[ss]= r; me[ss]= r;
}); });
me.ViewClose();
return *this; return *this;
} }
@@ -403,11 +238,12 @@ public:
/////////////////////////////////////////// ///////////////////////////////////////////
// user defined constructor // user defined constructor
/////////////////////////////////////////// ///////////////////////////////////////////
Lattice(GridBase *grid) { Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid; this->_grid = grid;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0); assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0; this->checkerboard=0;
SetViewMode(mode);
} }
// virtual ~Lattice(void) = default; // virtual ~Lattice(void) = default;
@@ -445,11 +281,12 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0; typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r); conformable(*this,r);
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
auto me = AcceleratorView(ViewWrite); auto me = View(AcceleratorWriteDiscard);
auto him= r.AcceleratorView(ViewRead); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
me.ViewClose(); him.ViewClose();
return *this; return *this;
} }
@@ -459,11 +296,12 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){ inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
conformable(*this,r); conformable(*this,r);
auto me = AcceleratorView(ViewWrite); auto me = View(AcceleratorWriteDiscard);
auto him= r.AcceleratorView(ViewRead); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
me.ViewClose(); him.ViewClose();
return *this; return *this;
} }
/////////////////////////////////////////// ///////////////////////////////////////////

View File

@@ -51,20 +51,23 @@ template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0]) Field; typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View()) View; typedef decltype(basis[0].View(AcceleratorRead)) View;
auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
Vector<View> basis_v(basis.size(),tmp_v); Vector<View> basis_v; basis_v.reserve(basis.size());
typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj; typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].AcceleratorView(ViewReadWrite); basis_v.push_back(basis[k].View(AcceleratorWrite));
} }
#ifndef GRID_NVCC #if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region thread_region
{ {
std::vector < vobj > B(Nm); // Thread private vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{ thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.; for(int j=j0; j<j1; ++j) B[j]=0.;
@@ -79,6 +82,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
}); });
} }
#else #else
View *basis_vp = &basis_v[0];
int nrot = j1-j0; int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda if (!nrot) // edge case not handled gracefully by Cuda
return; return;
@@ -90,8 +95,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
auto Bp=&Bt[0]; auto Bp=&Bt[0];
// GPU readable copy of matrix // GPU readable copy of matrix
Vector<double> Qt_jv(Nm*Nm); Vector<Coeff_t> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0]; Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{ thread_for(i,Nm*Nm,{
int j = i/Nm; int j = i/Nm;
int k = i%Nm; int k = i%Nm;
@@ -133,33 +138,40 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
}); });
} }
#endif #endif
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
} }
// Extract a single rotated vector // Extract a single rotated vector
template<class Field> template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0].AcceleratorView()) View; typedef decltype(basis[0].View(AcceleratorRead)) View;
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard(); result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.AcceleratorView(ViewWrite);
Vector<View> basis_v(basis.size(),result_v); Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].AcceleratorView(ViewRead); basis_v.push_back(basis[k].View(AcceleratorRead));
} }
vobj zz=Zero(); vobj zz=Zero();
Vector<double> Qt_jv(Nm); Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0]; double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k); for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz); vobj zzz=Zero();
auto B=coalescedRead(zzz);
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]); B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
} }
coalescedWrite(result_v[ss], B); coalescedWrite(result_v[ss], B);
}); });
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
} }
template<class Field> template<class Field>

View File

@@ -42,34 +42,6 @@ NAMESPACE_BEGIN(Grid);
typedef iScalar<vInteger> vPredicate ; typedef iScalar<vInteger> vPredicate ;
/*
template <class iobj, class vobj, class robj> accelerator_inline
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
{
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
*/
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// compare lattice to lattice // compare lattice to lattice
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -78,9 +50,9 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
}); });
@@ -93,8 +65,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{ {
Lattice<vPredicate> ret(lhs.Grid()); Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs); ret_v[ss]=op(lhs_v[ss],rhs);
}); });
@@ -107,8 +79,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]); ret_v[ss]=op(lhs,rhs_v[ss]);
}); });

View File

@@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid(); GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
auto l_v = l.View(); autoView(l_v, l, CpuWrite);
thread_for( o, grid->oSites(), { thread_for( o, grid->oSites(), {
vector_type vI; vector_type vI;
Coordinate gcoor; Coordinate gcoor;
@@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
}); });
}; };
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -43,8 +43,8 @@ template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
}); });
@@ -56,9 +56,9 @@ template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
}); });
@@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr; typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid()); Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),1,{ accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer // FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop // Use vector [] operator and don't read coalesce this loop

View File

@@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); autoView( X_v , X, CpuRead);
auto Y_v = Y.View(); autoView( Y_v , Y, CpuRead);
auto R_v = R.View(); autoView( R_v , R, CpuWrite);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); autoView( X_v , X, CpuRead);
auto R_v = R.View(); autoView( R_v , R, CpuWrite);
thread_region thread_region
{ {
@@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, CpuRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, CpuRead);
thread_region { thread_region {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);

View File

@@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
}); });
return ret; return ret;
@@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
}); });
return ret; return ret;
@@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{ {
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorWrite);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
}); });
} }
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{ {
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorWrite);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
}); });
} }
@@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// extract-modify-merge cycle is easiest way and this is not perf critical // extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); autoView( l_v , l, CpuWrite);
if ( rank == grid->ThisRank() ) { if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf); extract(l_v[odx],buf);
buf[idx] = s; buf[idx] = s;
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); autoView( l_v , l, CpuWrite);
extract(l_v[odx],buf); extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
return; return;
}; };
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Must be CPU read view
template<class vobj,class sobj> template<class vobj,class sobj>
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid = l.Grid(); GridBase *grid = l.getGrid();
assert(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -173,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
@@ -183,18 +182,27 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
return; return;
}; };
template<class vobj,class sobj> template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
{
autoView(lv,l,CpuRead);
peekLocalSite(s,lv,site);
return;
};
GridBase *grid=l.Grid(); // Must be CPU write view
template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid=l.getGrid();
assert(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -202,13 +210,19 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w]; vp[idx+w*Nsimd] = pt[w];
} }
return;
};
template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
{
autoView(lv,l,CpuWrite);
pokeLocalSite(s,lv,site);
return; return;
}; };

View File

@@ -0,0 +1,79 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reality.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REAL_IMAG_H
#define GRID_LATTICE_REAL_IMAG_H
// FIXME .. this is the sector of the code
// I am most worried about the directions
// The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome
NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] =real(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] =imag(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto real(const Expression &expr) -> decltype(real(closure(expr)))
{
return real(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto imag(const Expression &expr) -> decltype(imag(closure(expr)))
{
return imag(closure(expr));
}
NAMESPACE_END(Grid);
#endif

View File

@@ -40,26 +40,77 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto lhs_v = lhs.View(); accelerator_for( ss, lhs_v.size(), 1, {
auto ret_v = ret.View(); ret_v[ss] = adj(lhs_v[ss]);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
}); });
return ret; return ret;
}; };
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
}); });
return ret; return ret;
}; };
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
Lattice<typename vobj::Complexified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toComplex(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
Lattice<typename vobj::Realified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toReal(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toComplex(const Expression &expr) -> decltype(closure(expr))
{
return toComplex(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toReal(const Expression &expr) -> decltype(closure(expr))
{
return toReal(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto adj(const Expression &expr) -> decltype(closure(expr))
{
return adj(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto conjugate(const Expression &expr) -> decltype(closure(expr))
{
return conjugate(closure(expr));
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -25,7 +25,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
#include <Grid/Grid_Eigen_Dense.h> #include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC #if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h> #include <Grid/lattice/Lattice_reduction_gpu.h>
#endif #endif
@@ -39,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd(); // const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
@@ -63,23 +92,69 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
ssum = ssum+sumarray[i]; ssum = ssum+sumarray[i];
} }
return ssum; typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
} }
/*
Threaded max, don't use for now
template<class Double>
inline Double max(const Double *arg, Integer osites)
{
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
std::vector<Double> maxarray(nthread);
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
Double max=arg[0];
for(int ss=myoff;ss<mywork+myoff; ss++){
if( arg[ss] > max ) max = arg[ss];
}
maxarray[thr]=max;
});
Double tmax=maxarray[0];
for(int i=0;i<nthread;i++){
if (maxarray[i]>tmax) tmax = maxarray[i];
}
return tmax;
}
*/
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{ {
#ifdef GRID_NVCC #if defined(GRID_CUDA)||defined(GRID_HIP)
return sum_gpu(arg,osites); return sum_gpu(arg,osites);
#else #else
return sum_cpu(arg,osites); return sum_cpu(arg,osites);
#endif #endif
} }
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{ {
auto arg_v = arg.View(); #if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites(); Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites); auto ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum); arg.Grid()->GlobalSum(ssum);
return ssum; return ssum;
} }
@@ -92,6 +167,32 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm); return real(nrm);
} }
//The global maximum of the site norm2
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
{
typedef typename vobj::tensor_reduced vscalar; //iScalar<iScalar<.... <vPODtype> > >
typedef typename vscalar::scalar_object scalar; //iScalar<iScalar<.... <PODtype> > >
Lattice<vscalar> inner = localNorm2(arg);
auto grid = arg.Grid();
RealD max;
for(int l=0;l<grid->lSites();l++){
Coordinate coor;
scalar val;
RealD r;
grid->LocalIndexToLocalCoor(l,coor);
peekLocalSite(val,inner,coor);
r=real(TensorRemove(val));
if( (l==0) || (r>max)){
max=r;
}
}
grid->GlobalMax(max);
return max;
}
// Double inner product // Double inner product
template<class vobj> template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -102,42 +203,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
// Might make all code paths go this way.
auto left_v = left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC // Might make all code paths go this way.
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance... // GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; accelerator_for( ss, sites, 1,{
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
})
// This is in single precision and fails some tests
// Need a sumD that sums in double
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss]; auto x_l = left_v[ss];
auto y_l = right_v[ss]; auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l); inner_tmp_v[ss]=innerProductD(x_l,y_l);
}) });
nrm = TensorRemove(sum(inner_tmp_v,sites)); }
#endif
// This is in single precision and fails some tests
auto anrm = sum(inner_tmp_v,sites);
nrm = anrm;
return nrm; return nrm;
} }
@@ -175,40 +263,24 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid(); GridBase *grid = x.Grid();
auto x_v=x.AcceleratorView(ViewRead);
auto y_v=y.AcceleratorView(ViewRead);
auto z_v=z.AcceleratorView(ViewWrite);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU // GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; autoView( x_v, x, AcceleratorRead);
Vector<inner_t> inner_tmp(sites); autoView( y_v, y, AcceleratorRead);
auto inner_tmp_v = &inner_tmp[0]; autoView( z_v, z, AcceleratorWrite);
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, 1,{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp); inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp; z_v[ss]=tmp;
}); });
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites))); nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
return nrm; return nrm;
} }
@@ -224,47 +296,29 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
auto left_v=left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU // GPU
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites); Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0]; auto norm_tmp_v = &norm_tmp[0];
{
accelerator_for( ss, sites, nsimd,{ autoView(left_v,left, AcceleratorRead);
auto left_tmp = left_v(ss); autoView(right_v,right,AcceleratorRead);
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); accelerator_for( ss, sites, 1,{
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)); auto left_tmp = left_v[ss];
}); inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp); norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
}); });
// Already promoted to double }
tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
#endif
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0]; ip = tmp[0];
nrm = real(tmp[1]); nrm = real(tmp[1]);
@@ -335,7 +389,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
auto Data_v=Data.View(); autoView( Data_v, Data, CpuRead);
thread_for( r,rd, { thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
@@ -413,8 +467,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View(); autoView( lhv, lhs, CpuRead);
auto rhv=rhs.View(); autoView( rhv, rhs, CpuRead);
thread_for( r,rd,{ thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@@ -521,14 +575,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
auto Rv=R.View(); autoView( Rv, R, CpuWrite);
auto Xv=X.View(); autoView( Xv, X, CpuRead);
auto Yv=Y.View(); autoView( Yv, Y, CpuRead);
thread_for_collapse(2, n, e1, { thread_for2d( n, e1, b,e2, {
for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss]; Rv[ss] = at*Xv[ss]+Yv[ss];
}
}); });
} }
}; };
@@ -581,9 +633,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v=X.View(); autoView( X_v, X, CpuRead);
auto Y_v=Y.View(); autoView( Y_v, Y, CpuRead);
auto R_v=R.View(); autoView( R_v, R, CpuWrite);
thread_region thread_region
{ {
Vector<vobj> s_x(Nblock); Vector<vobj> s_x(Nblock);
@@ -628,13 +680,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// int nl=1; // int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog]; int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View(); autoView( R_v, R, CpuWrite);
auto X_v = X.View(); autoView( X_v, X, CpuRead);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@@ -692,8 +745,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto rhs_v=rhs.View(); autoView( rhs_v, rhs, CpuRead);
thread_region thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);

View File

@@ -1,7 +1,14 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define WARP_SIZE 32 #ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;
#define WARP_SIZE 64
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props; extern cudaDeviceProp *gpu_props;
#define WARP_SIZE 32
#endif
__device__ unsigned int retirementCount = 0; __device__ unsigned int retirementCount = 0;
template <class Iterator> template <class Iterator>
@@ -19,7 +26,12 @@ template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device; int device;
#ifdef GRID_CUDA
cudaGetDevice(&device); cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize; Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock; Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
@@ -53,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
// cannot use overloaded operators for sobj as they are not volatile-qualified // cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp(); acceleratorSynchronise();
const Iterator VEC = WARP_SIZE; const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1); const Iterator vid = tid & (VEC-1);
@@ -67,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
beta += temp; beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
} }
__syncwarp(); acceleratorSynchronise();
} }
__syncthreads(); acceleratorSynchroniseAll();
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
beta = Zero(); beta = Zero();
@@ -79,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
} }
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
} }
__syncthreads(); acceleratorSynchroniseAll();
} }
@@ -147,7 +159,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
sobj *smem = (sobj *)shmem_pointer; sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished // wait until all outstanding memory instructions in this thread are finished
__threadfence(); acceleratorFence();
if (tid==0) { if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x); unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
@@ -156,7 +168,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
} }
// each thread must read the correct value of amLast // each thread must read the correct value of amLast
__syncthreads(); acceleratorSynchroniseAll();
if (amLast) { if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1] // reduce buffer[0], ..., buffer[gridDim.x-1]
@@ -199,13 +211,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
cudaDeviceSynchronize(); accelerator_barrier();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
auto result = buffer_v[0]; auto result = buffer_v[0];
return result; return result;
} }

View File

@@ -375,7 +375,7 @@ public:
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View(); autoView(l_v, l, CpuWrite);
thread_for( ss, osites, { thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd); ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
@@ -462,7 +462,7 @@ public:
{ {
// Obtain one reseeded generator per thread // Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads(); int Nthread = 32; // Hardwire a good level or parallelism
std::vector<RngEngine> seeders(Nthread); std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){ for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine); seeders[t] = Reseed(master_engine);

View File

@@ -42,8 +42,8 @@ template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))> inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{ {
Lattice<decltype(trace(vobj()))> ret(lhs.Grid()); Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView(ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView(lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss))); coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
}); });
@@ -58,8 +58,8 @@ template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{ {
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
}); });

View File

@@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard // remove and insert a half checkerboard
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
{
half.Checkerboard() = cb; half.Checkerboard() = cb;
auto half_v = half.View(); autoView( half_v, half, CpuWrite);
auto full_v = full.View(); autoView( full_v, full, CpuRead);
thread_for(ss, full.Grid()->oSites(),{ thread_for(ss, full.Grid()->oSites(),{
int cbos; int cbos;
Coordinate coor; Coordinate coor;
@@ -64,11 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
} }
}); });
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ {
int cb = half.Checkerboard(); int cb = half.Checkerboard();
auto half_v = half.View(); autoView( half_v , half, CpuRead);
auto full_v = full.View(); autoView( full_v , full, CpuWrite);
thread_for(ss,full.Grid()->oSites(),{ thread_for(ss,full.Grid()->oSites(),{
Coordinate coor; Coordinate coor;
@@ -96,15 +97,29 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
out = in; out = in;
} }
#ifdef __CUDA_ARCH__ template<typename T>
accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
out = in;
}
// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
/*template<typename T1, typename T2>
accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
convertType(T1 & out, const T2 & in) {
out = in;
}*/
#ifdef GRID_SIMT
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in; ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
} }
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) { accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in; ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
} }
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) { accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in; ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
} }
#endif #endif
@@ -116,18 +131,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
} }
template<typename T1,typename T2,int N> template<typename T1,typename T2>
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in); accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) {
template<typename T1,typename T2,int N> convertType(out._internal,in._internal);
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in); }
template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr> template<typename T1,typename T2>
accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) { accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) {
convertType(out,in._internal); convertType(out,in._internal);
} }
template<typename T1,typename T2> template<typename T1,typename T2>
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) { accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) {
convertType(out._internal,in); convertType(out._internal,in);
} }
@@ -144,16 +159,10 @@ accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & i
convertType(out._internal[i],in._internal[i]); convertType(out._internal[i],in._internal[i]);
} }
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
accelerator_inline void convertType(T & out, const T & in) {
out = in;
}
template<typename T1,typename T2> template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) { accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
auto out_v = out.AcceleratorView(ViewWrite); autoView( out_v , out,AcceleratorWrite);
auto in_v = in.AcceleratorView(ViewRead); autoView( in_v , in ,AcceleratorRead);
accelerator_for(ss,out_v.size(),T1::Nsimd(),{ accelerator_for(ss,out_v.size(),T1::Nsimd(),{
convertType(out_v[ss],in_v(ss)); convertType(out_v[ss],in_v(ss));
}); });
@@ -164,19 +173,20 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>> -> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
{ {
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner; typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
Lattice<iScalar<t_inner>> ret(lhs.Grid()); Lattice<iScalar<t_inner>> ret(lhs.Grid());
auto ret_v = ret.AcceleratorView(ViewWrite);
{
autoView(ret_v, ret,AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss))); convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
}); });
}
return ret; return ret;
} }
@@ -194,9 +204,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<iScalar<CComplex>> ip(coarse); Lattice<iScalar<CComplex>> ip(coarse);
Lattice<vobj> fineDataRed = fineData; Lattice<vobj> fineDataRed = fineData;
// auto fineData_ = fineData.View(); autoView( coarseData_ , coarseData, AcceleratorWrite);
auto coarseData_ = coarseData.AcceleratorView(ViewWrite); autoView( ip_ , ip, AcceleratorWrite);
auto ip_ = ip.AcceleratorView(ViewReadWrite);
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine> blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
@@ -210,68 +219,6 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
} }
template<class vobj,class CComplex,int nbasis>
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis)
{
typedef iVector<CComplex,nbasis > coarseSiteData;
coarseSiteData elide;
typedef decltype(coalescedRead(elide)) ScalarComplex;
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension;
// checks
assert( nbasis == Basis.size() );
subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){
conformable(Basis[i],fineData);
}
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
}
int blockVol = fine->oSites()/coarse->oSites();
coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
////////////////////////////////////////////////////////////////////////////////////////////////////////
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
// Otherwise do fine inner product per site, and make the update atomic
////////////////////////////////////////////////////////////////////////////////////////////////////////
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
auto sc=sci/nbasis;
auto i=sci%nbasis;
auto Basis_ = Basis[i].View();
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
int sf;
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
for(int sb=0;sb<blockVol;sb++){
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
}
coalescedWrite(coarseData_[sc](i),reduce);
});
return;
}
template<class vobj,class vobj2,class CComplex> template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -298,10 +245,12 @@ template<class vobj,class vobj2,class CComplex>
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
auto fineZ_ = fineZ.AcceleratorView(ViewWrite); autoView( fineZ_ , fineZ, AcceleratorWrite);
auto fineX_ = fineX.AcceleratorView(ViewRead); autoView( fineX_ , fineX, AcceleratorRead);
auto fineY_ = fineY.AcceleratorView(ViewRead); autoView( fineY_ , fineY, AcceleratorRead);
auto coarseA_= coarseA.AcceleratorView(ViewRead); autoView( coarseA_, coarseA, AcceleratorRead);
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@@ -309,12 +258,12 @@ template<class vobj,class vobj2,class CComplex>
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
// z = A x + y // z = A x + y
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
typename vobj2::tensor_reduced::scalar_object cA; typename vobj2::tensor_reduced::scalar_object cA;
typename vobj::scalar_object cAx; typename vobj::scalar_object cAx;
#else #else
@@ -344,15 +293,16 @@ template<class vobj,class CComplex>
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
// Precision promotion // Precision promotion
fine_inner = localInnerProductD(fineX,fineY); fine_inner = localInnerProductD<vobj>(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
{
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, { accelerator_for(ss, coarse->oSites(), 1, {
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
}); });
}
} }
@@ -370,15 +320,16 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion? // Precision promotion?
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
{
autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, { accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss]; CoarseInner_[ss] = coarse_inner_[ss];
}); });
} }
}
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX) inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
@@ -408,14 +359,19 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
} }
int blockVol = fine->oSites()/coarse->oSites(); int blockVol = fine->oSites()/coarse->oSites();
auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite); // Turn this around to loop threaded over sc and interior loop
auto fineData_ = fineData.AcceleratorView(ViewRead); // over sf would thread better
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( fineData_ , fineData, AcceleratorRead);
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
accelerator_for(sc,coarse->oSites(),1,{ accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block // One thread per sub block
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
coarseData_[sc]=Zero(); coarseData_[sc]=Zero();
for(int sb=0;sb<blockVol;sb++){ for(int sb=0;sb<blockVol;sb++){
@@ -425,7 +381,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d]; for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions); Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
coarseData_[sc]=coarseData_[sc]+fineData_[sf]; coarseData_[sc]=coarseData_[sc]+fineData_[sf];
} }
@@ -510,8 +466,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
auto fineData_ = fineData.View(); autoView( fineData_ , fineData, AcceleratorWrite);
auto coarseData_ = coarseData.View(); autoView( coarseData_ , coarseData, AcceleratorRead);
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{ accelerator_for(sf,fine->oSites(),1,{
@@ -524,7 +480,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View(); /* auto basis_ = Basis[i], );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]); if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]); else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
} }
@@ -543,7 +499,14 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
fineData=Zero(); fineData=Zero();
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i); Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
auto ip_ = ip.AcceleratorView(ViewRead);
//Lattice<CComplex> cip(coarse);
//autoView( cip_ , cip, AcceleratorWrite);
//autoView( ip_ , ip, AcceleratorRead);
//accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
// coalescedWrite(cip_[sc], ip_(sc)());
// });
//blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
blockZAXPY(fineData,ip,Basis[i],fineData); blockZAXPY(fineData,ip,Basis[i],fineData);
} }
} }
@@ -571,15 +534,17 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->lSites() == og->lSites()); assert(ig->lSites() == og->lSites());
} }
autoView(in_v,in,CpuRead);
autoView(out_v,out,CpuWrite);
thread_for(idx, ig->lSites(),{ thread_for(idx, ig->lSites(),{
sobj s; sobj s;
ssobj ss; ssobj ss;
Coordinate lcoor(ni); Coordinate lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor); ig->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(s,in,lcoor); peekLocalSite(s,in_v,lcoor);
ss=s; ss=s;
pokeLocalSite(ss,out,lcoor); pokeLocalSite(ss,out_v,lcoor);
}); });
} }
@@ -614,8 +579,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions; Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride; Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride; Coordinate ost = Tg->_ostride;
auto t_v = To.AcceleratorView(ViewWrite);
auto f_v = From.AcceleratorView(ViewRead); autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{ accelerator_for(idx,Fg->lSites(),1,{
sobj s; sobj s;
Coordinate Fcoor(nd); Coordinate Fcoor(nd);
@@ -638,8 +604,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
} }
// peekLocalSite(s,From,Fcoor);
// pokeLocalSite(s,To ,Tcoor);
} }
}); });
} }
@@ -670,6 +634,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@@ -682,8 +648,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl++];
} }
} }
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDimv,hcoor);
}); });
} }
@@ -711,6 +677,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@@ -723,8 +691,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl++];
} }
} }
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDimv,lcoor);
}); });
} }
@@ -752,6 +720,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@@ -760,8 +730,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
hcoor[orthog] = slice_hi; hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDimv,hcoor);
} }
}); });
} }
@@ -789,6 +759,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@@ -797,8 +769,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
hcoor[orthog] = slice_hi; hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDimv,lcoor);
} }
}); });
} }
@@ -862,7 +834,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
} }
//loop over outer index //loop over outer index
auto in_v = in.View(); autoView( in_v , in, CpuRead);
thread_for(in_oidx,in_grid->oSites(),{ thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> out_ptrs(in_nsimd); ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@@ -955,7 +927,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
icoor[lane].resize(ndim); icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane); grid->iCoorFromIindex(icoor[lane],lane);
} }
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for(oidx, grid->oSites(),{ thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> ptrs(nsimd); ExtractPointerArray<sobj> ptrs(nsimd);
@@ -1058,7 +1030,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
std::vector<SobjOut> in_slex_conv(in_grid->lSites()); std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in); unvectorizeToLexOrdArray(in_slex_conv, in);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{ thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim); Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx); out_grid->oCoorFromOindex(out_ocoor, out_oidx);

View File

@@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
}); });
@@ -58,8 +58,8 @@ template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))> inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{ {
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
}); });

View File

@@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs, rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret, ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{ accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y); ret[ss]=pow(rhs[ss],y);
@@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
} }
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y)); coalescedWrite(ret[ss],mod(rhs(ss),y));
@@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
ret.Checkerboard() = rhs_i.Checkerboard(); ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y)); coalescedWrite(ret[ss],div(rhs(ss),y));
@@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));

173
Grid/lattice/Lattice_view.h Normal file
View File

@@ -0,0 +1,173 @@
#pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
//public:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
ViewAdvise advise;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { return advise; };
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
// Host only
GridBase * getGrid(void) const { return _grid; };
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
ViewMode mode;
void * cpu_ptr;
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
return coalescedRead(this->_odata[i]);
}
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
#if 1
// accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; };
#else
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
#endif
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
this->ViewOpen(mode);
}
// Host functions
void ViewOpen(ViewMode mode)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
this->cpu_ptr = (void *)this->_odata;
this->mode = mode;
this->_odata =(vobj *)
MemoryManager::ViewOpen(this->cpu_ptr,
this->_odata_size*sizeof(vobj),
mode,
this->advise);
}
void ViewClose(void)
{ // Inform the manager
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
ViewCloser(View &_v) : v(_v) {};
~ViewCloser() { v.ViewClose(); }
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
NAMESPACE_END(Grid);

View File

@@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
conformable(iftrue,predicate); conformable(iftrue,predicate);
conformable(iftrue,ret); conformable(iftrue,ret);
GridBase *grid=iftrue._grid; GridBase *grid=iftrue.Grid();
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
@@ -52,22 +52,23 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
std::vector<Integer> mask(Nsimd); autoView(iftrue_v,iftrue,CpuRead);
std::vector<scalar_object> truevals (Nsimd); autoView(iffalse_v,iffalse,CpuRead);
std::vector<scalar_object> falsevals(Nsimd); autoView(predicate_v,predicate,CpuRead);
autoView(ret_v,ret,CpuWrite);
parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){ Integer NN= grid->oSites();
thread_for(ss,NN,{
extract(iftrue._odata[ss] ,truevals); Integer mask;
extract(iffalse._odata[ss] ,falsevals); scalar_object trueval;
extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask); scalar_object falseval;
for(int l=0;l<Nsimd;l++){
for(int s=0;s<Nsimd;s++){ trueval =extractLane(l,iftrue_v[ss]);
if (mask[s]) falsevals[s]=truevals[s]; falseval=extractLane(l,iffalse_v[ss]);
} mask =extractLane(l,predicate_v[ss]);
if (mask) falseval=trueval;
merge(ret._odata[ss],falsevals); insertLane(l,ret_v[ss],falseval);
} }
});
} }
template<class vobj,class iobj> template<class vobj,class iobj>
@@ -76,9 +77,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
conformable(iftrue,iffalse); conformable(iftrue,iffalse);
conformable(iftrue,predicate); conformable(iftrue,predicate);
Lattice<vobj> ret(iftrue._grid); Lattice<vobj> ret(iftrue.Grid());
where(ret,predicate,iftrue,iffalse); whereWolf(ret,predicate,iftrue,iffalse);
return ret; return ret;
} }

View File

@@ -130,6 +130,8 @@ public:
friend std::ostream& operator<< (std::ostream& stream, Logger& log){ friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) { if ( log.active ) {
std::ios_base::fmtflags f(stream.flags());
stream << log.background()<< std::left; stream << log.background()<< std::left;
if (log.topWidth > 0) if (log.topWidth > 0)
{ {
@@ -152,6 +154,8 @@ public:
<< now << log.background() << " : " ; << now << log.background() << " : " ;
} }
stream << log.colour(); stream << log.colour();
stream.flags(f);
return stream; return stream;
} else { } else {
return devnull; return devnull;

View File

@@ -1,3 +1,4 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1; int Grid::BinaryIO::latticeWriteMaxRetry = -1;
Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;

View File

@@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO { class BinaryIO {
public: public:
struct IoPerf
{
uint64_t size{0},time{0};
double mbytesPerSecond{0.};
};
static IoPerf lastPerf;
static int latticeWriteMaxRetry; static int latticeWriteMaxRetry;
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
@@ -264,7 +271,7 @@ class BinaryIO {
uint32_t &scidac_csumb) uint32_t &scidac_csumb)
{ {
grid->Barrier(); grid->Barrier();
GridStopWatch timer; GridStopWatch timer, insideTimer;
GridStopWatch bstimer; GridStopWatch bstimer;
nersc_csum=0; nersc_csum=0;
@@ -356,7 +363,10 @@ class BinaryIO {
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl; std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0); ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0); ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0); insideTimer.Start();
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);
insideTimer.Stop();
assert(ierr==0);
MPI_File_close(&fh); MPI_File_close(&fh);
MPI_Type_free(&fileArray); MPI_Type_free(&fileArray);
MPI_Type_free(&localArray); MPI_Type_free(&localArray);
@@ -431,7 +441,9 @@ class BinaryIO {
assert(ierr == 0); assert(ierr == 0);
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl; std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
insideTimer.Start();
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
insideTimer.Stop();
assert(ierr == 0); assert(ierr == 0);
MPI_Offset os; MPI_Offset os;
@@ -502,12 +514,20 @@ class BinaryIO {
timer.Stop(); timer.Stop();
} }
lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
lastPerf.time = timer.useconds();
lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
std::cout<<GridLogMessage<<"IOobject: "; std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read "; if ( control & BINARYIO_READ) std::cout << " read ";
else std::cout << " write "; else std::cout << " write ";
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank; uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" " std::cout<< lastPerf.size <<"bytes in "<< timer.Elapsed() <<" "
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl; << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
std::cout << GridLogMessage << "IOobject: pure MPI IO call "
<< lastPerf.size <<" bytes in "
<< insideTimer.Elapsed() << " "
<< lastPerf.size/1024./1024./(insideTimer.useconds()/1.0e6)
<<" MB/s "<<std::endl;
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
@@ -663,10 +683,15 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
timer.Start(); timer.Start();
thread_for(lidx,lsites,{ thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel_rng.SetState(tmp,lidx); Coordinate lcoor;
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.SetState(tmp,gidx);
}); });
timer.Stop(); timer.Stop();
@@ -723,7 +748,12 @@ class BinaryIO {
std::vector<RNGstate> iodata(lsites); std::vector<RNGstate> iodata(lsites);
thread_for(lidx,lsites,{ thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
parallel_rng.GetState(tmp,lidx); Coordinate lcoor;
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.GetState(tmp,gidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}); });
timer.Stop(); timer.Stop();

View File

@@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter {
// Don't require scidac records EXCEPT checksum // Don't require scidac records EXCEPT checksum
// Use Grid MetaData object if present. // Use Grid MetaData object if present.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
template <class vsimd> template <class stats = PeriodicGaugeStatistics>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description)
{ {
GridBase * grid = Umu.Grid(); GridBase * grid = Umu.Grid();
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<vLorentzColourMatrixD> GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj; typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
//////////////////////////////////////// ////////////////////////////////////////
@@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter {
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
stats Stats;
Stats(Umu,header);
std::string format = header.floating_point; std::string format = header.floating_point;
header.ensemble_id = description; header.ensemble_id = description;
header.ensemble_label = description; header.ensemble_label = description;
@@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader {
// Else use ILDG MetaData object if present. // Else use ILDG MetaData object if present.
// Else use SciDAC MetaData object if present. // Else use SciDAC MetaData object if present.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
template <class vsimd> template <class stats = PeriodicGaugeStatistics>
void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) { void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<vLorentzColourMatrixD > GaugeField;
typedef typename GaugeField::vector_object vobj; typedef typename GaugeField::vector_object vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader {
if ( found_FieldMetaData || found_usqcdInfo ) { if ( found_FieldMetaData || found_usqcdInfo ) {
FieldMetaData checker; FieldMetaData checker;
GaugeStatistics(Umu,checker); stats Stats;
Stats(Umu,checker);
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;

View File

@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
std::time_t t = std::time(nullptr); std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t); std::tm tm_ = *std::localtime(&t);
std::ostringstream oss; std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z"); oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str(); header.creation_date = oss.str();
header.archive_date = header.creation_date; header.archive_date = header.creation_date;
@@ -176,29 +176,18 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
GridMetaData(grid,header); GridMetaData(grid,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header) template<class Impl>
class GaugeStatistics
{ {
// How to convert data precision etc... public:
header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data); void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{ {
// How to convert data precision etc... header.link_trace=WilsonLoops<Impl>::linkTrace(data);
header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data); header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
} }
};
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{ {
GridBase *grid = field.Grid(); GridBase *grid = field.Grid();
@@ -206,7 +195,6 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header); GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }

View File

@@ -40,6 +40,8 @@ using namespace Grid;
class NerscIO : public BinaryIO { class NerscIO : public BinaryIO {
public: public:
typedef Lattice<vLorentzColourMatrixD> GaugeField;
static inline void truncate(std::string file){ static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out); std::ofstream fout(file,std::ios::out);
} }
@@ -129,12 +131,12 @@ public:
// Now the meat: the object readers // Now the meat: the object readers
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd> template<class GaugeStats=PeriodicGaugeStatistics>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, static inline void readConfiguration(GaugeField &Umu,
FieldMetaData& header, FieldMetaData& header,
std::string file) std::string file,
GaugeStats GaugeStatisticsCalculator=GaugeStats())
{ {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu.Grid(); GridBase *grid = Umu.Grid();
uint64_t offset = readHeader(file,Umu.Grid(),header); uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -153,23 +155,23 @@ public:
// munger is a function of <floating point, Real, data_type> // munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) { if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) { if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format, (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
if ( ieee64 || ieee64big ) { if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format, (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) { if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF> BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format, (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
if ( ieee64 || ieee64big ) { if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD> BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format, (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
@@ -177,7 +179,7 @@ public:
assert(0); assert(0);
} }
GaugeStatistics(Umu,clone); GaugeStats Stats; Stats(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl; <<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
@@ -203,15 +205,22 @@ public:
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl; std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
} }
template<class vsimd> // Preferred interface
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"))
{
writeConfiguration(Umu,file,0,1,ens_label);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file, std::string file,
int two_row, int two_row,
int bits32) int bits32,
std::string ens_label = std::string("DWF"))
{ {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef vLorentzColourMatrixD vobj;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
FieldMetaData header; FieldMetaData header;
@@ -219,8 +228,8 @@ public:
// Following should become arguments // Following should become arguments
/////////////////////////////////////////// ///////////////////////////////////////////
header.sequence_number = 1; header.sequence_number = 1;
header.ensemble_id = "UKQCD"; header.ensemble_id = std::string("UKQCD");
header.ensemble_label = "DWF"; header.ensemble_label = ens_label;
typedef LorentzColourMatrixD fobj3D; typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D; typedef LorentzColour2x3D fobj2D;
@@ -229,7 +238,7 @@ public:
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
GaugeStatistics(Umu,header); GaugeStats Stats; Stats(Umu,header);
MachineCharacteristics(header); MachineCharacteristics(header);
uint64_t offset; uint64_t offset;

View File

@@ -154,7 +154,7 @@ public:
grid->Barrier(); timer.Stop(); grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl; std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
GaugeStatistics(Umu, clone); PeriodicGaugeStatistics Stats; Stats(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette); RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

View File

@@ -208,7 +208,7 @@ public:
FieldMetaData clone(header); FieldMetaData clone(header);
GaugeStatistics(Umu, clone); PeriodicGaugeStatistics Stats; Stats(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette); RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

View File

@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#ifdef __x86_64__ #ifdef __x86_64__
#ifdef GRID_NVCC #ifdef GRID_CUDA
accelerator_inline uint64_t __rdtsc(void) { return 0; } accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; } accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else #else
@@ -112,7 +112,6 @@ class PerformanceCounter {
private: private:
typedef struct { typedef struct {
public:
uint32_t type; uint32_t type;
uint64_t config; uint64_t config;
const char *name; const char *name;

View File

@@ -12773,7 +12773,7 @@ namespace pugi
#undef PUGI__THROW_ERROR #undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR #undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC #ifdef GRID_CUDA
#pragma pop #pragma pop
#endif #endif

View File

@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
static constexpr int Zm = 6; static constexpr int Zm = 6;
static constexpr int Tm = 7; static constexpr int Tm = 7;
static constexpr int Nc=3; static constexpr int Nc=Config_Nc;
static constexpr int Ns=4; static constexpr int Ns=4;
static constexpr int Nd=4; static constexpr int Nd=4;
static constexpr int Nhs=2; // half spinor static constexpr int Nhs=2; // half spinor
@@ -80,6 +80,13 @@ template<typename T> struct isSpinor {
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ; template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ; template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
const int CoarseIndex = 4;
template<typename T> struct isCoarsened {
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
};
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
// ChrisK very keen to add extra space for Gparity doubling. // ChrisK very keen to add extra space for Gparity doubling.
// //
// Also add domain wall index, in a way where Wilson operator // Also add domain wall index, in a way where Wilson operator

View File

@@ -41,7 +41,7 @@ class Action
public: public:
bool is_smeared = false; bool is_smeared = false;
// Heatbath? // Heatbath?
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
virtual std::string action_name() = 0; // return the action name virtual std::string action_name() = 0; // return the action name

View File

@@ -115,18 +115,21 @@ public:
PokeIndex<LorentzIndex>(Uadj, U, mu); PokeIndex<LorentzIndex>(Uadj, U, mu);
} }
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { autoView(Umu_v,Umu,CpuRead);
autoView(Uadj_v,Uadj,CpuRead);
autoView(Uds_v,Uds,CpuWrite);
thread_for( lidx, GaugeGrid->lSites(), {
Coordinate lcoor; Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu, lcoor); peekLocalSite(ScalarUmu, Umu_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj, lcoor); peekLocalSite(ScalarUmu, Uadj_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds, lcoor); pokeLocalSite(ScalarUds, Uds_v, lcoor);
} });
} }
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)

View File

@@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D); NAMESPACE_CHECK(Wilson5D);
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h> #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h> #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
NAMESPACE_CHECK(Staggered); NAMESPACE_CHECK(Staggered);
@@ -282,16 +283,14 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD; typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD; typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
#ifndef GRID_NVCC
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
//////////////////// ////////////////////

View File

@@ -153,8 +153,8 @@ public:
typedef typename Impl::StencilImpl StencilImpl; \ typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams; \ typedef typename Impl::ImplParams ImplParams; \
typedef typename Impl::StencilImpl::View_type StencilView; \ typedef typename Impl::StencilImpl::View_type StencilView; \
typedef typename ViewMap<FermionField>::Type FermionFieldView; \ typedef const typename ViewMap<FermionField>::Type FermionFieldView; \
typedef typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView; typedef const typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
#define INHERIT_IMPL_TYPES(Base) \ #define INHERIT_IMPL_TYPES(Base) \
INHERIT_GIMPL_TYPES(Base) \ INHERIT_GIMPL_TYPES(Base) \
@@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Single flavour one component spinors with colour index. 5d vec // Single flavour one component spinors with colour index. 5d vec
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> // Deprecate Vec5d
NAMESPACE_CHECK(ImplStaggered5dVec); //#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
//NAMESPACE_CHECK(ImplStaggered5dVec);

View File

@@ -96,43 +96,31 @@ public:
int sl = St._simd_layout[direction]; int sl = St._simd_layout[direction];
Coordinate icoor; Coordinate icoor;
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
_Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd(); const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = SIMTlane(Nsimd); int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s); St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd; int mmu = mu % Nd;
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
//Decide whether we do a G-parity flavor twist
//Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
//It also assumes (but does not check) that abs(distance) == 1
int permute_lane = (sl==1) int permute_lane = (sl==1)
|| ((distance== 1)&&(icoor[direction]==1)) || ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0)); || ((distance==-1)&&(icoor[direction]==0));
if ( permute_lane ) { permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
tmp(0) = chi(1);
tmp(1) = chi(0);
} else {
tmp(0) = chi(0);
tmp(1) = chi(1);
}
auto UU0=coalescedRead(U(0)(mu)); //Apply the links
auto UU1=coalescedRead(U(1)(mu)); int f_upper = permute_lane ? 1 : 0;
int f_lower = !f_upper;
mult(&phi(0),&UU0,&tmp(0)); mult(&phi(0),&UU0,&chi(f_upper));
mult(&phi(1),&UU1,&tmp(1)); mult(&phi(1),&UU1,&chi(f_lower));
} else {
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
mult(&phi(0),&UU0,&chi(0));
mult(&phi(1),&UU1,&chi(1));
}
#else #else
typedef _Spinor vobj; typedef _Spinor vobj;
@@ -233,14 +221,16 @@ public:
Uconj = where(coor==neglink,-Uconj,Uconj); Uconj = where(coor==neglink,-Uconj,Uconj);
} }
auto U_v = U.View(); {
auto Uds_v = Uds.View(); autoView( U_v , U, CpuRead);
auto Uconj_v = Uconj.View(); autoView( Uconj_v , Uconj, CpuRead);
auto Utmp_v= Utmp.View(); autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,U_v,{ thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss](); Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss](); Uds_v[ss](1)(mu) = Uconj_v[ss]();
}); });
}
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
Uconj = adj(Cshift(Uconj,mu,-1)); Uconj = adj(Cshift(Uconj,mu,-1));
@@ -250,19 +240,25 @@ public:
Utmp = where(coor==0,Uconj,Utmp); Utmp = where(coor==0,Uconj,Utmp);
} }
{
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{ thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss](); Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
}); });
}
Utmp = Uconj; Utmp = Uconj;
if ( Params.twists[mu] ) { if ( Params.twists[mu] ) {
Utmp = where(coor==0,U,Utmp); Utmp = where(coor==0,U,Utmp);
} }
{
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{ thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss](); Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
}); });
}
} }
} }
@@ -272,11 +268,14 @@ public:
GaugeLinkField link(mat.Grid()); GaugeLinkField link(mat.Grid());
// use lorentz for flavour as hack. // use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A)); auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
auto link_v = link.View();
auto tmp_v = tmp.View(); {
autoView( link_v , link, CpuWrite);
autoView( tmp_v , tmp, CpuRead);
thread_foreach(ss,tmp_v,{ thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
}); });
}
PokeIndex<LorentzIndex>(mat, link, mu); PokeIndex<LorentzIndex>(mat, link, mu);
return; return;
} }
@@ -306,9 +305,10 @@ public:
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
auto tmp_v = tmp.View(); {
auto Atilde_v = Atilde.View(); autoView( tmp_v , tmp, CpuWrite);
auto Btilde_v = Btilde.View(); autoView( Atilde_v , Atilde, CpuRead);
autoView( Btilde_v , Btilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{ thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss; int sF = s + Ls * ss;
@@ -316,6 +316,7 @@ public:
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
} }
}); });
}
PokeIndex<LorentzIndex>(mat, tmp, mu); PokeIndex<LorentzIndex>(mat, tmp, mu);
return; return;
} }

View File

@@ -208,7 +208,7 @@ public:
LebesgueOrder LebesgueEvenOdd; LebesgueOrder LebesgueEvenOdd;
// Comms buffer // Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Conserved current utilities // Conserved current utilities

View File

@@ -85,7 +85,7 @@ class MADWF
maxiter =_maxiter; maxiter =_maxiter;
}; };
void operator() (const FermionFieldo &src4,FermionFieldo &sol5) void operator() (const FermionFieldo &src,FermionFieldo &sol5)
{ {
std::cout << GridLogMessage<< " ************************************************" << std::endl; std::cout << GridLogMessage<< " ************************************************" << std::endl;
std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl; std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl;
@@ -114,8 +114,16 @@ class MADWF
/////////////////////////////////////// ///////////////////////////////////////
//Import source, include Dminus factors //Import source, include Dminus factors
/////////////////////////////////////// ///////////////////////////////////////
Mato.ImportPhysicalFermionSource(src4,b); GridBase *src_grid = src.Grid();
std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
if ( src_grid == Mato.GaugeGrid() ) {
Mato.ImportPhysicalFermionSource(src,b);
} else {
b=src;
}
std::cout << GridLogMessage << " src " <<norm2(src)<<std::endl;
std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl; std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl;
defect = b; defect = b;

View File

@@ -0,0 +1,194 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
#define GRID_QCD_NAIVE_STAG_FERMION_H
NAMESPACE_BEGIN(Grid);
class NaiveStaggeredFermionStatic {
public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
};
template <class Impl>
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
void M(const FermionField &in, FermionField &out);
void Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
void Mooee(const FermionField &in, FermionField &out);
void MooeeDag(const FermionField &in, FermionField &out);
void MooeeInv(const FermionField &in, FermionField &out);
void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop (const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st,
DoubledGaugeField &U,
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
NaiveStaggeredFermion(GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge (const GaugeField &_U );
DoubledGaugeField &GetU(void) { return Umu ; } ;
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
// any other parameters of action ???
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &srct,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
NAMESPACE_END(Grid);
#endif

View File

@@ -72,19 +72,23 @@ public:
StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){}; StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
static accelerator_inline void multLink(SiteSpinor &phi, template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
const SiteSpinor &chi, const _Spinor &chi,
int mu) int mu)
{ {
mult(&phi(), &U(mu), &chi()); auto UU = coalescedRead(U(mu));
mult(&phi(), &UU, &chi());
} }
static accelerator_inline void multLinkAdd(SiteSpinor &phi, template<class _Spinor>
static accelerator_inline void multLinkAdd(_Spinor &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
const SiteSpinor &chi, const _Spinor &chi,
int mu) int mu)
{ {
mac(&phi(), &U(mu), &chi()); auto UU = coalescedRead(U(mu));
mac(&phi(), &UU, &chi());
} }
template <class ref> template <class ref>

View File

@@ -49,21 +49,35 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
public: public:
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
protected:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels // Generic Nc kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, template<int Naik>
static accelerator_inline
void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> static accelerator_inline
void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> static accelerator_inline
void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
@@ -71,15 +85,21 @@ public:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels // Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> static accelerator_inline
void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> static accelerator_inline
void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik> static accelerator_inline
void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
@@ -87,27 +107,11 @@ public:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels // Asm Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
public: public:

View File

@@ -113,20 +113,7 @@ public:
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu) inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
{ {
GridBase *GaugeGrid = U_ds.Grid(); assert(0);
thread_for(lidx, GaugeGrid->lSites(),{
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, U_ds, lcoor);
peekLocalSite(ScalarU, U, lcoor);
ScalarUds(mu) = ScalarU();
});
} }
inline void DoubleStore(GridBase *GaugeGrid, inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &UUUds, // for Naik term

View File

@@ -245,7 +245,7 @@ public:
return out; return out;
} }
private: protected:
// here fixing the 4 dimensions, make it more general? // here fixing the 4 dimensions, make it more general?
RealD csw_r; // Clover coefficient - spatial RealD csw_r; // Clover coefficient - spatial
@@ -257,15 +257,16 @@ private:
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12 // eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices // using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F) CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{ {
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v,T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@@ -281,9 +282,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v, T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v, F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = -F_v[i]()(); T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()(); T_v[i]()(1, 0) = F_v[i]()();
@@ -299,9 +300,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v,T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()()); T_v[i]()(1, 1) = timesI(F_v[i]()());
@@ -317,9 +318,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v , T, AcceleratorWrite);
auto F_v = F.View(); autoView( F_v , F, AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = timesI(F_v[i]()()); T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()()); T_v[i]()(1, 0) = timesI(F_v[i]()());
@@ -335,9 +336,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v ,T,AcceleratorWrite);
auto F_v = F.View(); autoView( F_v ,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = -(F_v[i]()()); T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()()); T_v[i]()(1, 0) = (F_v[i]()());
@@ -354,9 +355,9 @@ private:
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v , T,AcceleratorWrite);
auto F_v = F.View(); autoView( F_v , F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 0) = timesI(F_v[i]()()); T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); T_v[i]()(1, 1) = timesMinusI(F_v[i]()());

View File

@@ -61,7 +61,7 @@ public:
typedef typename SiteHalfSpinor::vector_type vComplexHigh; typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) { accelerator_inline int CommDatumSize(void) const {
return sizeof(SiteHalfCommSpinor); return sizeof(SiteHalfCommSpinor);
} }
@@ -69,7 +69,7 @@ public:
/* Compress includes precision change if mpi data is not same */ /* Compress includes precision change if mpi data is not same */
/*****************************************************/ /*****************************************************/
template<class _SiteHalfSpinor, class _SiteSpinor> template<class _SiteHalfSpinor, class _SiteSpinor>
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
_SiteHalfSpinor tmp; _SiteHalfSpinor tmp;
projector::Proj(tmp,in,mu,dag); projector::Proj(tmp,in,mu,dag);
vstream(buf[o],tmp); vstream(buf[o],tmp);
@@ -81,7 +81,7 @@ public:
accelerator_inline void Exchange(SiteHalfSpinor *mp, accelerator_inline void Exchange(SiteHalfSpinor *mp,
const SiteHalfSpinor * __restrict__ vp0, const SiteHalfSpinor * __restrict__ vp0,
const SiteHalfSpinor * __restrict__ vp1, const SiteHalfSpinor * __restrict__ vp1,
Integer type,Integer o){ Integer type,Integer o) const {
SiteHalfSpinor tmp1; SiteHalfSpinor tmp1;
SiteHalfSpinor tmp2; SiteHalfSpinor tmp2;
exchange(tmp1,tmp2,vp0[o],vp1[o],type); exchange(tmp1,tmp2,vp0[o],vp1[o],type);
@@ -93,7 +93,7 @@ public:
/* Have a decompression step if mpi data is not same */ /* Have a decompression step if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out, accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
SiteHalfSpinor * __restrict__ in, Integer o) { SiteHalfSpinor * __restrict__ in, Integer o) const {
assert(0); assert(0);
} }
@@ -103,7 +103,7 @@ public:
accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0, accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
SiteHalfSpinor * __restrict__ out1, SiteHalfSpinor * __restrict__ out1,
const SiteSpinor * __restrict__ in, const SiteSpinor * __restrict__ in,
Integer j,Integer k, Integer m,Integer type) Integer j,Integer k, Integer m,Integer type) const
{ {
SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp1, temp2;
SiteHalfSpinor temp3, temp4; SiteHalfSpinor temp3, temp4;
@@ -117,7 +117,7 @@ public:
/*****************************************************/ /*****************************************************/
/* Pass the info to the stencil */ /* Pass the info to the stencil */
/*****************************************************/ /*****************************************************/
accelerator_inline bool DecompressionStep(void) { return false; } accelerator_inline bool DecompressionStep(void) const { return false; }
}; };
@@ -142,7 +142,7 @@ public:
typedef typename SiteHalfSpinor::vector_type vComplexHigh; typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) { accelerator_inline int CommDatumSize(void) const {
return sizeof(SiteHalfCommSpinor); return sizeof(SiteHalfCommSpinor);
} }
@@ -150,7 +150,7 @@ public:
/* Compress includes precision change if mpi data is not same */ /* Compress includes precision change if mpi data is not same */
/*****************************************************/ /*****************************************************/
template<class _SiteHalfSpinor, class _SiteSpinor> template<class _SiteHalfSpinor, class _SiteSpinor>
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
_SiteHalfSpinor hsp; _SiteHalfSpinor hsp;
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
projector::Proj(hsp,in,mu,dag); projector::Proj(hsp,in,mu,dag);
@@ -163,7 +163,7 @@ public:
accelerator_inline void Exchange(SiteHalfSpinor *mp, accelerator_inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0, SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1, SiteHalfSpinor *vp1,
Integer type,Integer o){ Integer type,Integer o) const {
SiteHalfSpinor vt0,vt1; SiteHalfSpinor vt0,vt1;
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
@@ -175,7 +175,7 @@ public:
/*****************************************************/ /*****************************************************/
/* Have a decompression step if mpi data is not same */ /* Have a decompression step if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){ accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
} }
@@ -186,7 +186,7 @@ public:
accelerator_inline void CompressExchange(SiteHalfSpinor *out0, accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1, SiteHalfSpinor *out1,
const SiteSpinor *in, const SiteSpinor *in,
Integer j,Integer k, Integer m,Integer type){ Integer j,Integer k, Integer m,Integer type) const {
SiteHalfSpinor temp1, temp2,temp3,temp4; SiteHalfSpinor temp1, temp2,temp3,temp4;
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
@@ -200,7 +200,7 @@ public:
/*****************************************************/ /*****************************************************/
/* Pass the info to the stencil */ /* Pass the info to the stencil */
/*****************************************************/ /*****************************************************/
accelerator_inline bool DecompressionStep(void) { return true; } accelerator_inline bool DecompressionStep(void) const { return true; }
}; };

View File

@@ -74,6 +74,20 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
void Report(void);
void ZeroCounters(void);
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopTotalTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent
@@ -196,5 +210,3 @@ typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD; typedef WilsonFermion<WilsonImplD> WilsonFermionD;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -215,7 +215,7 @@ public:
LebesgueOrder LebesgueEvenOdd; LebesgueOrder LebesgueEvenOdd;
// Comms buffer // Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
}; };

View File

@@ -72,7 +72,7 @@ public:
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor; typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams; typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl; typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
typedef typename StencilImpl::View_type StencilView; typedef const typename StencilImpl::View_type StencilView;
ImplParams Params; ImplParams Params;
@@ -106,11 +106,15 @@ public:
const _SpinorField & phi, const _SpinorField & phi,
int mu) int mu)
{ {
auto out_v= out.View(); const int Nsimd = SiteHalfSpinor::Nsimd();
auto phi_v= phi.View(); autoView( out_v, out, AcceleratorWrite);
auto Umu_v= Umu.View(); autoView( phi_v, phi, AcceleratorRead);
thread_for(sss,out.Grid()->oSites(),{ autoView( Umu_v, Umu, AcceleratorRead);
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
coalescedWrite(out_v[sss],tmp);
}); });
} }
@@ -180,29 +184,57 @@ public:
mat = TraceIndex<SpinIndex>(P); mat = TraceIndex<SpinIndex>(P);
} }
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){ inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
{
for (int mu = 0; mu < Nd; mu++) for (int mu = 0; mu < Nd; mu++)
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu); mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ {
#undef USE_OLD_INSERT_FORCE
int Ls=Btilde.Grid()->_fdimensions[0]; int Ls=Btilde.Grid()->_fdimensions[0];
autoView( mat_v , mat, AcceleratorWrite);
#ifdef USE_OLD_INSERT_FORCE
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
auto tmp_v = tmp.View(); {
auto Btilde_v = Btilde.View(); const int Nsimd = SiteSpinor::Nsimd();
auto Atilde_v = Atilde.View(); autoView( tmp_v , tmp, AcceleratorWrite);
thread_for(sss,tmp.Grid()->oSites(),{ autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,tmp.Grid()->oSites(),1,{
int sU=sss; int sU=sss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sF = s+Ls*sU; int sF = s+Ls*sU;
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
} }
}); });
}
PokeIndex<LorentzIndex>(mat,tmp,mu); PokeIndex<LorentzIndex>(mat,tmp,mu);
#else
{
const int Nsimd = SiteSpinor::Nsimd();
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF]()(spn) );
auto op = outerProduct(bb,aa);
sum = sum + op;
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
#endif
} }
}; };

View File

@@ -49,9 +49,17 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base; typedef FermionOperator<Impl> Base;
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public: public:
#ifdef GRID_SYCL
#define SYCL_HACK
#endif
#ifdef SYCL_HACK
static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
#endif
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ; int interior=1,int exterior=1) ;

View File

@@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_NVCC #ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
@@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type, Current curr_type,
unsigned int mu) unsigned int mu)
{ {
#ifndef GRID_NVCC #if (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = { Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX, Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY, Gamma::Algebra::GammaY,
@@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid); PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid); PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0; PropagatorField zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for (int s=0;s<Ls;s++) { for (int s=0;s<Ls;s++) {
@@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
} }
#endif #endif
#ifndef GRID_NVCC #if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0; int tshift = (mu == Nd-1) ? 1 : 0;
//////////////////////////////////////////////// ////////////////////////////////////////////////
// GENERAL CAYLEY CASE // GENERAL CAYLEY CASE
@@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid); PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid); PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0; PropagatorField zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
@@ -880,11 +880,23 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
} }
std::vector<RealD> G_s(Ls,1.0); std::vector<RealD> G_s(Ls,1.0);
Integer sign = 1; // sign flip for vector/tadpole
if ( curr_type == Current::Axial ) { if ( curr_type == Current::Axial ) {
for(int s=0;s<Ls/2;s++){ for(int s=0;s<Ls/2;s++){
G_s[s] = -1.0; G_s[s] = -1.0;
} }
} }
else if ( curr_type == Current::Tadpole ) {
auto b=this->_b;
auto c=this->_c;
if ( b == 1 && c == 0 ) {
sign = -1;
}
else {
std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
assert(b==1 && c==0);
}
}
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
@@ -907,7 +919,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = Cshift(tmp,mu,1); tmp = Cshift(tmp,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu); Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop tmp = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated

View File

@@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
@@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
{ {
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
@@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
int Ls=this->Ls; int Ls=this->Ls;
@@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
int Ls=this->Ls; int Ls=this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
auto plee = & lee [0]; auto plee = & lee [0];
auto pdee = & dee [0]; auto pdee = & dee [0];

View File

@@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi, psi_i,CpuRead);
auto phi = phi_i.View(); autoView(phi, phi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi, chi_i,CpuWrite);
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
const int nsimd= Simd::Nsimd(); const int nsimd= Simd::Nsimd();
@@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi=psi_i.View(); autoView(psi,psi_i,CpuRead);
auto phi=phi_i.View(); autoView(phi,phi_i,CpuRead);
auto chi=chi_i.View(); autoView(chi,chi_i,CpuWrite);
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd(); int nsimd= Simd::Nsimd();
@@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
Vector<iSinglet<Simd> > &Matm) Vector<iSinglet<Simd> > &Matm)
{ {
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
#ifndef AVX512 #ifndef AVX512
{ {
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
@@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
EnableIf<Impl::LsVectorised,int> sfinae=0; EnableIf<Impl::LsVectorised,int> sfinae=0;
#ifndef AVX512 #ifndef AVX512
{ {
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM; SiteHalfSpinor BcastM;
@@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
} }
#else #else
{ {
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
// pointers // pointers
// MASK_REGS; // MASK_REGS;
#define Chi_00 %zmm0 #define Chi_00 %zmm0

View File

@@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls; int Ls = this->Ls;
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto phi = phi_i.View(); autoView( phi , phi_i, AcceleratorRead);
auto psi = psi_i.View(); autoView( psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
auto pupper = &upper[0]; auto pupper = &upper[0];
@@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView( psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView( phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
auto pupper = &upper[0]; auto pupper = &upper[0];
@@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto psi=psi_i.View(); autoView( psi, psi_i, AcceleratorRead);
auto chi=chi_i.View(); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0]; auto plee = & this->lee[0];
@@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto psi = psi_i.View(); autoView( psi, psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0]; auto plee = & this->lee[0];

View File

@@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
Compressor compressor; Compressor compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
auto Umu_v = Umu.View(); autoView( Umu_v , Umu, CpuRead);
auto UUUmu_v = UUUmu.View(); autoView( UUUmu_v , UUUmu, CpuRead);
auto in_v = in.View(); autoView( in_v , in, CpuRead);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for( ss,Umu.Grid()->oSites(),{ thread_for( ss,Umu.Grid()->oSites(),{
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sU=ss; int sU=ss;
@@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
} }
@@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes)); // assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor; Compressor compressor;
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
@@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.Prepare(); st.Prepare();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor // st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
double ctime=0;
double ptime=0;
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons. // Remove explicit thread mapping introduced for OPA reasons.
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime) DhopComputeTime-=usecond();
{ {
int tid = omp_get_thread_num(); int interior=1;
int nthreads = omp_get_num_threads(); int exterior=0;
int ncomms = CartesianCommunicator::nCommThreads; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U.Grid()->oSites(); // 4d vol
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
} }
DhopComputeTime+=usecond();
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
}
}
ptime = usecond() - start;
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
}
}
DhopCommTime += ctime;
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
DhopComputeTime2-=usecond(); st.CommunicateComplete(requests);
DhopCommTime +=usecond();
auto U_v = U.View(); DhopComputeTime2-=usecond();
auto UUU_v = UUU.View(); {
auto in_v = in.View(); int interior=0;
auto out_v = out.View(); int exterior=1;
if (dag == DaggerYes) { Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
});
} else {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
});
} }
DhopComputeTime2+=usecond(); DhopComputeTime2+=usecond();
#else
assert(0);
#endif
} }
template<class Impl> template<class Impl>
@@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor; Compressor compressor;
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond(); //double t1=usecond();
DhopTotalTime -= usecond(); DhopTotalTime -= usecond();
DhopCommTime -= usecond(); DhopCommTime -= usecond();
@@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
auto U_v = U.View(); {
auto UUU_v = UUU.View(); int interior=1;
auto in_v = in.View(); int exterior=1;
auto out_v = out.View(); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (dag == DaggerYes) {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
});
} else {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
});
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
DhopTotalTime += usecond(); DhopTotalTime += usecond();
//double t2=usecond();
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
} }
/*CHANGE END*/ /*CHANGE END*/

View File

@@ -258,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
//////////////////////// ////////////////////////
// Call the single hop // Call the single hop
//////////////////////// ////////////////////////
auto U_v = U.View(); autoView( U_v , U, CpuRead);
auto UUU_v = UUU.View(); autoView( UUU_v , UUU, CpuRead);
auto B_v = B.View(); autoView( B_v , B, CpuWrite);
auto Btilde_v = Btilde.View(); autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{ thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
}); });
@@ -386,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
Compressor compressor; Compressor compressor;
Stencil.HaloExchange(in, compressor); Stencil.HaloExchange(in, compressor);
auto Umu_v = Umu.View(); autoView( Umu_v , Umu, CpuRead);
auto UUUmu_v = UUUmu.View(); autoView( UUUmu_v , UUUmu, CpuRead);
auto in_v = in.View(); autoView( in_v , in, CpuRead);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for( sss, in.Grid()->oSites(),{ thread_for( sss, in.Grid()->oSites(),{
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
}); });
@@ -403,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
} }
template <class Impl> template <class Impl>
@@ -417,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
#ifdef GRID_OMP
Compressor compressor; Compressor compressor;
int len = U.Grid()->oSites(); int len = U.Grid()->oSites();
@@ -426,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopFaceTime -= usecond(); DhopFaceTime -= usecond();
st.Prepare(); st.Prepare();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor); st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond(); DhopFaceTime+= usecond();
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons. // Removed explicit thread comms
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
#pragma omp parallel
{ {
int tid = omp_get_thread_num(); int interior=1;
int nthreads = omp_get_num_threads(); int exterior=0;
int ncomms = CartesianCommunicator::nCommThreads; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
}
} else {
st.CommunicateThreaded();
}
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing // First to enter, last to leave timing
DhopFaceTime -= usecond(); DhopFaceTime -= usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
@@ -487,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopComputeTime2 -= usecond(); DhopComputeTime2 -= usecond();
{ {
auto U_v = U.View(); int interior=0;
auto UUU_v = UUU.View(); int exterior=1;
auto in_v = in.View(); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
auto out_v = out.View();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
} else {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
}
} }
DhopComputeTime2 += usecond(); DhopComputeTime2 += usecond();
#else
assert(0);
#endif
} }
@@ -528,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
st.HaloExchange(in, compressor); st.HaloExchange(in, compressor);
DhopCommTime += usecond(); DhopCommTime += usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
if (dag == DaggerYes) { {
thread_for(sss, in.Grid()->oSites(),{ int interior=1;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); int exterior=1;
}); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} else {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
DhopTotalTime += usecond(); DhopTotalTime += usecond();

View File

@@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm; auto pm = this->pm;
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
@@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0]; auto plee = & this->lee [0];
auto pdee = & this->dee [0]; auto pdee = & this->dee [0];
@@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm; auto pm = this->pm;
auto plee = & this->lee [0]; auto plee = & this->lee [0];
@@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0]; auto plee = & this->lee [0];
auto pdee = & this->dee [0]; auto pdee = & this->dee [0];
@@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto pm = this->pm; auto pm = this->pm;

View File

@@ -0,0 +1,499 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
_tmp(&Hgrid)
{
int vol4;
int LLs=1;
c1=_c1;
u0=_u0;
vol4= _grid->oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4= _cbgrid->oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
{
ImportGauge(_U);
}
////////////////////////////////////////////////////////////
// Momentum space propagator should be
// https://arxiv.org/pdf/hep-lat/9712010.pdf
//
// mom space action.
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
//
// must track through staggered flavour/spin reduction in literature to
// turn to free propagator for the one component chi field, a la page 4/5
// of above link to implmement fourier based solver.
////////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
{
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
{
GaugeLinkField U(GaugeGrid());
DoubledGaugeField _UUU(GaugeGrid());
////////////////////////////////////////////////////////
// Double Store should take two fields for Naik and one hop separately.
// Discard teh Naik as Naive
////////////////////////////////////////////////////////
Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
////////////////////////////////////////////////////////
// Apply scale factors to get the right fermion Kinetic term
// Could pass coeffs into the double store to save work.
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
U = PeekIndex<LorentzIndex>(Umu, mu+4);
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
}
CopyGaugeCheckerboards();
}
/////////////////////////////
// Implement the interface
/////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo);
axpy(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes);
axpy(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass);
out = scal * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField & mat,
const FermionField &A, const FermionField &B, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor;
FermionField Btilde(B.Grid());
FermionField Atilde(B.Grid());
Atilde = A;
st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) {
////////////////////////
// Call the single hop
////////////////////////
autoView( U_v , U, CpuRead);
autoView( B_v , B, CpuWrite);
autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
});
assert(0);// need to figure out the force interface with a blasted three link term.
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _grid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
mat.Checkerboard() = U.Checkerboard();
DerivInternal(Stencil, Umu, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Even);
assert(U.Checkerboard() == Odd);
mat.Checkerboard() = Odd;
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Odd);
assert(U.Checkerboard() == Even);
mat.Checkerboard() = Even;
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
DhopDir(in, out, dir, disp);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
assert(0); // Not implemented yet
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
{
Compressor compressor;
Stencil.HaloExchange(in, compressor);
autoView( Umu_v , Umu, CpuRead);
autoView( in_v , in, CpuRead);
autoView( out_v , out, CpuWrite);
// thread_for( sss, in.Grid()->oSites(),{
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
// });
assert(0);
};
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,lo,U,in,out,dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
Compressor compressor;
int len = U.Grid()->oSites();
DhopTotalTime -= usecond();
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=0;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime -= usecond();
DhopComputeTime2 -= usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void NaiveStaggeredFermion<Impl>::Report(void)
{
Coordinate latt = _grid->GlobalDimensions();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
assert(0);
}
NAMESPACE_END(Grid);

View File

@@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
assert(0); assert(0);
@@ -680,12 +680,14 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge2 =(uint64_t)&UU[sU]( Z ); \
gauge3 =(uint64_t)&UU[sU]( T ); gauge3 =(uint64_t)&UU[sU]( T );
#undef STAG_VEC5D
#ifdef STAG_VEC5D
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h> #include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@@ -702,9 +704,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHI(addr0,addr1,addr2,addr3);
@@ -736,10 +739,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
} }
#include <Grid/simd/Intel512double.h> #include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dag) int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@@ -756,8 +759,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHI(addr0,addr1,addr2,addr3);
@@ -787,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
#endif #endif
} }
#endif
#define PERMUTE_DIR3 __asm__ ( \ #define PERMUTE_DIR3 __asm__ ( \
@@ -821,10 +825,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h> #include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@@ -841,9 +845,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
int sF=s+LLs*sU; {
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1); LOAD_CHIa(addr0,addr1);
@@ -890,10 +894,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
} }
#include <Grid/simd/Intel512double.h> #include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@@ -910,9 +914,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
int sF=s+LLs*sU; {
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1); LOAD_CHIa(addr0,addr1);

View File

@@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define LOAD_CHI(b) \ #ifdef GRID_SIMT
#define LOAD_CHI(ptype,b) \
const SiteSpinor & ref (b[offset]); \
Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane); \
Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane); \
Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
#define LOAD_CHI_COMMS(b) \
const SiteSpinor & ref (b[offset]); \
Chi_0=coalescedRead(ref()()(0),lane); \
Chi_1=coalescedRead(ref()()(1),lane); \
Chi_2=coalescedRead(ref()()(2),lane);
#define PERMUTE_DIR(dir) ;
#else
#define LOAD_CHI(ptype,b) LOAD_CHI_COMMS(b)
#define LOAD_CHI_COMMS(b) \
const SiteSpinor & ref (b[offset]); \ const SiteSpinor & ref (b[offset]); \
Chi_0=ref()()(0); \ Chi_0=ref()()(0); \
Chi_1=ref()()(1); \ Chi_1=ref()()(1); \
Chi_2=ref()()(2); Chi_2=ref()()(2);
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
#endif
// To splat or not to splat depends on the implementation // To splat or not to splat depends on the implementation
#define MULT(A,UChi) \ #define MULT(A,UChi) \
auto & ref(U[sU](A)); \ auto & ref(U[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \ U_00=coalescedRead(ref()(0,0),lane); \
Impl::loadLinkElement(U_10,ref()(1,0)); \ U_10=coalescedRead(ref()(1,0),lane); \
Impl::loadLinkElement(U_20,ref()(2,0)); \ U_20=coalescedRead(ref()(2,0),lane); \
Impl::loadLinkElement(U_01,ref()(0,1)); \ U_01=coalescedRead(ref()(0,1),lane); \
Impl::loadLinkElement(U_11,ref()(1,1)); \ U_11=coalescedRead(ref()(1,1),lane); \
Impl::loadLinkElement(U_21,ref()(2,1)); \ U_21=coalescedRead(ref()(2,1),lane); \
Impl::loadLinkElement(U_02,ref()(0,2)); \ U_02=coalescedRead(ref()(0,2),lane); \
Impl::loadLinkElement(U_12,ref()(1,2)); \ U_12=coalescedRead(ref()(1,2),lane); \
Impl::loadLinkElement(U_22,ref()(2,2)); \ U_22=coalescedRead(ref()(2,2),lane); \
UChi ## _0 = U_00*Chi_0; \ UChi ## _0 = U_00*Chi_0; \
UChi ## _1 = U_10*Chi_0;\ UChi ## _1 = U_10*Chi_0;\
UChi ## _2 = U_20*Chi_0;\ UChi ## _2 = U_20*Chi_0;\
@@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);
#define MULT_ADD(U,A,UChi) \ #define MULT_ADD(U,A,UChi) \
auto & ref(U[sU](A)); \ auto & ref(U[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \ U_00=coalescedRead(ref()(0,0),lane); \
Impl::loadLinkElement(U_10,ref()(1,0)); \ U_10=coalescedRead(ref()(1,0),lane); \
Impl::loadLinkElement(U_20,ref()(2,0)); \ U_20=coalescedRead(ref()(2,0),lane); \
Impl::loadLinkElement(U_01,ref()(0,1)); \ U_01=coalescedRead(ref()(0,1),lane); \
Impl::loadLinkElement(U_11,ref()(1,1)); \ U_11=coalescedRead(ref()(1,1),lane); \
Impl::loadLinkElement(U_21,ref()(2,1)); \ U_21=coalescedRead(ref()(2,1),lane); \
Impl::loadLinkElement(U_02,ref()(0,2)); \ U_02=coalescedRead(ref()(0,2),lane); \
Impl::loadLinkElement(U_12,ref()(1,2)); \ U_12=coalescedRead(ref()(1,2),lane); \
Impl::loadLinkElement(U_22,ref()(2,2)); \ U_22=coalescedRead(ref()(2,2),lane); \
UChi ## _0 += U_00*Chi_0; \ UChi ## _0 += U_00*Chi_0; \
UChi ## _1 += U_10*Chi_0;\ UChi ## _1 += U_10*Chi_0;\
UChi ## _2 += U_20*Chi_0;\ UChi ## _2 += U_20*Chi_0;\
@@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
UChi ## _2 += U_22*Chi_2; UChi ## _2 += U_22*Chi_2;
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
SE=st.GetEntry(ptype,Dir+skew,sF); \ SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \ offset = SE->_offset; \
local = SE->_is_local; \ local = SE->_is_local; \
perm = SE->_permute; \ perm = SE->_permute; \
if ( local ) { \ if ( local ) { \
LOAD_CHI(in); \ LOAD_CHI(Perm,in); \
if ( perm) { \ if ( perm) { \
PERMUTE_DIR(Perm); \ PERMUTE_DIR(Perm); \
} \ } \
} else { \ } else { \
LOAD_CHI(buf); \ LOAD_CHI_COMMS(buf); \
} }
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \ #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
@@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
} }
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \ #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \ SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \ offset = SE->_offset; \
local = SE->_is_local; \ local = SE->_is_local; \
perm = SE->_permute; \ perm = SE->_permute; \
if ( local ) { \ if ( local ) { \
LOAD_CHI(in); \ LOAD_CHI(Perm,in); \
if ( perm) { \ if ( perm) { \
PERMUTE_DIR(Perm); \ PERMUTE_DIR(Perm); \
} \ } \
} else if ( st.same_node[Dir] ) { \ } else if ( st.same_node[Dir] ) { \
LOAD_CHI(buf); \ LOAD_CHI_COMMS(buf); \
} \ } \
if (local || st.same_node[Dir] ) { \ if (local || st.same_node[Dir] ) { \
MULT_ADD(U,Dir,even); \ MULT_ADD(U,Dir,even); \
@@ -140,49 +158,59 @@ NAMESPACE_BEGIN(Grid);
local = SE->_is_local; \ local = SE->_is_local; \
if ((!local) && (!st.same_node[Dir]) ) { \ if ((!local) && (!st.same_node[Dir]) ) { \
nmu++; \ nmu++; \
{ LOAD_CHI(buf); } \ { LOAD_CHI_COMMS(buf); } \
{ MULT_ADD(U,Dir,even); } \ { MULT_ADD(U,Dir,even); } \
} }
#define HAND_DECLARATIONS(Simd) \
Simd even_0; \
Simd even_1; \
Simd even_2; \
Simd odd_0; \
Simd odd_1; \
Simd odd_2; \
\
Simd Chi_0; \
Simd Chi_1; \
Simd Chi_2; \
\
Simd U_00; \
Simd U_10; \
Simd U_20; \
Simd U_01; \
Simd U_11; \
Simd U_21; \
Simd U_02; \
Simd U_12; \
Simd U_22;
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs const int Nsimd = SiteHalfSpinor::Nsimd();
Simd Chi_1; const int lane=acceleratorSIMTlane(Nsimd);
Simd Chi_2; typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
HAND_DECLARATIONS(Simt);
Simd U_00; // two rows of U matrix typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
Simd U_10; calcSiteSpinor result;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype; int offset,local,perm, ptype;
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
skew = 0; skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even); HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
@@ -193,6 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG (U,Ym,2,skew,odd); HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even); HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd); HAND_STENCIL_LEG (U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even); HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
@@ -202,7 +231,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even); HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
}
if ( dag ) { if ( dag ) {
result()()(0) = - even_0 - odd_0; result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1; result()()(1) = - even_1 - odd_1;
@@ -212,52 +241,39 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
result()()(1) = even_1 + odd_1; result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2; result()()(2) = even_2 + odd_2;
} }
vstream(out[sF],result); coalescedWrite(out[sF],result);
} }
} }
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc const int Nsimd = SiteHalfSpinor::Nsimd();
Simd even_1; const int lane=acceleratorSIMTlane(Nsimd);
Simd even_2; typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
Simd odd_0; // 12 regs on knc HAND_DECLARATIONS(Simt);
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
Simd Chi_1; calcSiteSpinor result;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset, ptype, local, perm; int offset, ptype, local, perm;
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); zeroit(even_0); zeroit(even_1); zeroit(even_2);
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); zeroit(odd_0); zeroit(odd_1); zeroit(odd_2);
skew = 0; skew = 0;
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even); HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
@@ -268,6 +284,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even); HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd); HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
@@ -277,7 +294,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
}
// Assume every site must be connected to at least one interior point. No 1^4 subvols. // Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) { if ( dag ) {
result()()(0) = - even_0 - odd_0; result()()(0) = - even_0 - odd_0;
@@ -288,52 +305,39 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
result()()(1) = even_1 + odd_1; result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2; result()()(2) = even_2 + odd_2;
} }
vstream(out[sF],result); coalescedWrite(out[sF],result);
} }
} }
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc const int Nsimd = SiteHalfSpinor::Nsimd();
Simd even_1; const int lane=acceleratorSIMTlane(Nsimd);
Simd even_2; typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
Simd odd_0; // 12 regs on knc HAND_DECLARATIONS(Simt);
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
Simd Chi_1; calcSiteSpinor result;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset, ptype, local; int offset, ptype, local;
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); zeroit(even_0); zeroit(even_1); zeroit(even_2);
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); zeroit(odd_0); zeroit(odd_1); zeroit(odd_2);
int nmu=0; int nmu=0;
skew = 0; skew = 0;
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even); HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
@@ -344,6 +348,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd); HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
@@ -353,7 +358,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
}
// Add sum of all exterior connected stencil legs // Add sum of all exterior connected stencil legs
if ( nmu ) { if ( nmu ) {
if ( dag ) { if ( dag ) {
@@ -365,11 +370,12 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
result()()(1) = even_1 + odd_1; result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2; result()()(2) = even_2 + odd_2;
} }
out[sF] = out[sF] + result; coalescedWrite(out[sF] , out(sF)+ result);
} }
} }
} }
/*
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ #define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@@ -385,8 +391,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \ SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \ const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI #undef LOAD_CHI
#undef HAND_DECLARATIONS
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -35,39 +35,32 @@ NAMESPACE_BEGIN(Grid);
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \ #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \ SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \ if (SE->_is_local ) { \
if (SE->_permute) { \ int perm= SE->_permute; \
chi_p = &chi; \ chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
permute(chi, in[SE->_offset], ptype); \
} else { \ } else { \
chi_p = &in[SE->_offset]; \ chi = coalescedRead(buf[SE->_offset],lane); \
} \ } \
} else { \ acceleratorSynchronise(); \
chi_p = &buf[SE->_offset]; \ multLink(Uchi, U[sU], chi, Dir);
} \
multLink(Uchi, U[sU], *chi_p, Dir);
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \ SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \ if (SE->_is_local ) { \
if (SE->_permute) { \ int perm= SE->_permute; \
chi_p = &chi; \ chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \ } else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \ chi = coalescedRead(buf[SE->_offset],lane); \
} \ } \
if (SE->_is_local || st.same_node[Dir] ) { \ if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U[sU], *chi_p, Dir); \ multLink(Uchi, U[sU], chi, Dir); \
} }
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \ SE = st.GetEntry(ptype, Dir+skew, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \ nmu++; \
chi_p = &buf[SE->_offset]; \ chi = coalescedRead(buf[SE->_offset],lane); \
multLink(Uchi, U[sU], *chi_p, Dir); \ multLink(Uchi, U[sU], chi, Dir); \
} }
template <class Impl> template <class Impl>
@@ -78,19 +71,25 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap // Int, Ext, Int+Ext cases for comms overlap
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag) { const FermionFieldView &in, FermionFieldView &out, int dag)
const SiteSpinor *chi_p; {
SiteSpinor chi; typedef decltype(coalescedRead(in[0])) calcSpinor;
SiteSpinor Uchi; calcSpinor chi;
calcSpinor Uchi;
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
int skew; int skew;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; //
// int sF=LLs*sU+s;
{
skew = 0; skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink); GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@@ -100,6 +99,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@@ -109,10 +109,11 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) { if ( dag ) {
Uchi = - Uchi; Uchi = - Uchi;
} }
vstream(out[sF], Uchi); coalescedWrite(out[sF], Uchi,lane);
} }
}; };
@@ -120,19 +121,24 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
// Only contributions from interior of our node // Only contributions from interior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) { const FermionFieldView &in, FermionFieldView &out,int dag)
const SiteSpinor *chi_p; {
SiteSpinor chi; typedef decltype(coalescedRead(in[0])) calcSpinor;
SiteSpinor Uchi; calcSpinor chi;
calcSpinor Uchi;
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
int skew ; int skew ;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; // int sF=LLs*sU+s;
{
skew = 0; skew = 0;
Uchi=Zero(); Uchi=Zero();
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@@ -143,6 +149,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@@ -152,10 +159,11 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) { if ( dag ) {
Uchi = - Uchi; Uchi = - Uchi;
} }
vstream(out[sF], Uchi); coalescedWrite(out[sF], Uchi,lane);
} }
}; };
@@ -164,20 +172,25 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
// Only contributions from exterior of our node // Only contributions from exterior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) { const FermionFieldView &in, FermionFieldView &out,int dag)
const SiteSpinor *chi_p; {
// SiteSpinor chi; typedef decltype(coalescedRead(in[0])) calcSpinor;
SiteSpinor Uchi; calcSpinor chi;
calcSpinor Uchi;
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
int nmu=0; int nmu=0;
int skew ; int skew ;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; // int sF=LLs*sU+s;
{
skew = 0; skew = 0;
Uchi=Zero(); Uchi=Zero();
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@@ -188,6 +201,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@@ -197,12 +211,13 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( nmu ) { if ( nmu ) {
auto _out = coalescedRead(out[sF],lane);
if ( dag ) { if ( dag ) {
out[sF] = out[sF] - Uchi; coalescedWrite(out[sF], _out-Uchi,lane);
} else { } else {
out[sF] = out[sF] + Uchi; coalescedWrite(out[sF], _out+Uchi,lane);
} }
} }
} }
@@ -211,72 +226,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel // Driving / wrapping routine to select right kernel
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
SiteSpinor *buf, int LLs, int sU, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=1;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=0;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionFieldView &in, FermionFieldView &out,
int dag,int interior,int exterior)
{
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break;
#endif
case OptHandUnroll:
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
};
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
{ {
// Disp should be either +1,-1,+3,-3 // Disp should be either +1,-1,+3,-3
// What about "dag" ? // What about "dag" ?
@@ -285,6 +237,112 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
assert(0); assert(0);
} }
#define KERNEL_CALLNB(A,improved) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
#define ASM_CALL(A) \
const uint64_t NN = Nsite*Ls; \
thread_for( ss, NN, { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
autoView( UUU_v , UUU, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
autoView( UUU_v , U, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
}
}
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -92,18 +92,16 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
int lvol = _Umu.Grid()->lSites(); int lvol = _Umu.Grid()->lSites();
int DimRep = Impl::Dimension; int DimRep = Impl::Dimension;
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Coordinate lcoor;
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
for (int site = 0; site < lvol; site++)
{
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CloverTerm, lcoor);
Qxinv = Zero();
//if (csw!=0){ //if (csw!=0){
for (int j = 0; j < Ns; j++) for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++) for (int k = 0; k < Ns; k++)
@@ -123,21 +121,22 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// } // }
pokeLocalSite(Qxinv, CloverTermInv, lcoor); pokeLocalSite(Qxinv, CTIv, lcoor);
});
} }
// Separate the even and odd parts // Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm))); pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm))); pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv))); pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv))); pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
} }
template <class Impl> template <class Impl>

View File

@@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
cosha = (one + W*W + sk) / (abs(W)*2.0); cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh // FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){ for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd); Coordinate lcoor(Nd);
Tcomplex cc; Tcomplex cc;
// RealD sgn; // RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor); _grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha,lcoor); peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0); assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15); assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0); cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a,lcoor); pokeLocalSite(cc,a_v,lcoor);
}
} }
Wea = ( exp( a) * abs(W) ); Wea = ( exp( a) * abs(W) );
@@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
cosha = (one + W*W + sk) / (abs(W)*2.0); cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh // FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){ for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd); Coordinate lcoor(Nd);
Tcomplex cc; Tcomplex cc;
// RealD sgn; // RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor); _grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha,lcoor); peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0); assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15); assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0); cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a,lcoor); pokeLocalSite(cc,a_v,lcoor);
} }}
Wea = ( exp( a) * abs(W) ); Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) ); Wema= ( exp(-a) * abs(W) );

View File

@@ -67,9 +67,99 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
diag_mass = 4.0 + mass; diag_mass = 4.0 + mass;
} }
int vol4;
vol4=Fgrid.oSites();
Stencil.BuildSurfaceList(1,vol4);
vol4=Hgrid.oSites();
StencilEven.BuildSurfaceList(1,vol4);
StencilOdd.BuildSurfaceList(1,vol4);
}
template<class Impl>
void WilsonFermion<Impl>::Report(void)
{
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
RealD volume = 1;
Coordinate latt = _grid->GlobalDimensions();
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
} }
if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
// how to count flops here?
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call ? : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node ? : " << mflops/NP << std::endl;
// how to count flops here?
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) ? : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl; }
if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
if ( DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
}
}
template<class Impl>
void WilsonFermion<Impl>::ZeroCounters(void) {
DhopCalls = 0; // ok
DhopCommTime = 0;
DhopComputeTime = 0;
DhopComputeTime2= 0;
DhopFaceTime = 0;
DhopTotalTime = 0;
DerivCalls = 0; // ok
DerivCommTime = 0;
DerivComputeTime = 0;
DerivDhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
Stencil.ZeroCountersi();
StencilEven.ZeroCountersi();
StencilOdd.ZeroCountersi();
}
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{ {
@@ -229,6 +319,7 @@ template <class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField &mat, const FermionField &A, GaugeField &mat, const FermionField &A,
const FermionField &B, int dag) { const FermionField &B, int dag) {
DerivCalls++;
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@@ -237,8 +328,11 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
FermionField Atilde(B.Grid()); FermionField Atilde(B.Grid());
Atilde = A; Atilde = A;
DerivCommTime-=usecond();
st.HaloExchange(B, compressor); st.HaloExchange(B, compressor);
DerivCommTime+=usecond();
DerivComputeTime-=usecond();
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag // Flip gamma (1+g)<->(1-g) if dag
@@ -246,6 +340,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
int gamma = mu; int gamma = mu;
if (!dag) gamma += Nd; if (!dag) gamma += Nd;
DerivDhopComputeTime -= usecond();
int Ls=1; int Ls=1;
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma); Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@@ -253,7 +348,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
// spin trace outer product // spin trace outer product
////////////////////////////////////////////////// //////////////////////////////////////////////////
Impl::InsertForce4D(mat, Btilde, Atilde, mu); Impl::InsertForce4D(mat, Btilde, Atilde, mu);
DerivDhopComputeTime += usecond();
} }
DerivComputeTime += usecond();
} }
template <class Impl> template <class Impl>
@@ -300,6 +397,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{ {
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid()); conformable(in.Grid(), out.Grid());
@@ -311,6 +409,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{ {
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@@ -323,6 +422,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@@ -387,13 +487,14 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
DhopTotalTime-=usecond();
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag); DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else else
#endif #endif
DhopInternalSerial(st,lo,U,in,out,dag); DhopInternalSerial(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
} }
template <class Impl> template <class Impl>
@@ -412,38 +513,53 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
///////////////////////////// /////////////////////////////
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
st.Prepare(); st.Prepare();
DhopFaceTime-=usecond();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
st.CommunicateBegin(requests); st.CommunicateBegin(requests);
///////////////////////////// /////////////////////////////
// Overlap with comms // Overlap with comms
///////////////////////////// /////////////////////////////
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor); st.CommsMergeSHM(compressor);
DhopFaceTime+=usecond();
///////////////////////////// /////////////////////////////
// do the compute interior // do the compute interior
///////////////////////////// /////////////////////////////
int Opt = WilsonKernelsStatic::Opt; int Opt = WilsonKernelsStatic::Opt;
DhopComputeTime-=usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
} else { } else {
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
} }
DhopComputeTime+=usecond();
///////////////////////////// /////////////////////////////
// Complete comms // Complete comms
///////////////////////////// /////////////////////////////
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
DhopCommTime +=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond();
///////////////////////////// /////////////////////////////
// do the compute exterior // do the compute exterior
///////////////////////////// /////////////////////////////
DhopComputeTime2-=usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
} else { } else {
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
} }
DhopComputeTime2+=usecond();
}; };
@@ -455,14 +571,18 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
{ {
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
DhopCommTime-=usecond();
st.HaloExchange(in, compressor); st.HaloExchange(in, compressor);
DhopCommTime+=usecond();
DhopComputeTime-=usecond();
int Opt = WilsonKernelsStatic::Opt; int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) { if (dag == DaggerYes) {
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
} else { } else {
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
} }
DhopComputeTime+=usecond();
}; };
/*Change ends */ /*Change ends */
@@ -483,32 +603,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
conformable(_grid, q_in_1.Grid()); conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid()); conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid()); conformable(_grid, q_out.Grid());
#if 0 assert(0);
PropagatorField tmp1(_grid), tmp2(_grid);
q_out = Zero();
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
// Inefficient comms method but not performance critical.
tmp1 = Cshift(q_in_1, mu, 1);
tmp2 = Cshift(q_in_2, mu, 1);
auto tmp1_v = tmp1.View();
auto tmp2_v = tmp2.View();
auto q_in_1_v=q_in_1.View();
auto q_in_2_v=q_in_2.View();
auto q_out_v = q_out.View();
auto Umu_v = Umu.View();
thread_for(sU, Umu.Grid()->oSites(),{
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
q_in_2_v[sU],
q_out_v[sU],
Umu_v, sU, mu);
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
tmp2_v[sU],
q_out_v[sU],
Umu_v, sU, mu);
});
#else
#endif
} }
@@ -524,62 +619,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
{ {
conformable(_grid, q_in.Grid()); conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid()); conformable(_grid, q_out.Grid());
#if 0 assert(0);
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
Complex i(0.0,1.0);
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
q_out = Zero();
LatticeInteger coords(_grid);
LatticeCoordinate(coords, Tp);
// Need q(x + mu) and q(x - mu).
tmp = Cshift(q_in, mu, 1);
tmpFwd = tmp*lattice_cmplx;
tmp = lattice_cmplx*q_in;
tmpBwd = Cshift(tmp, mu, -1);
auto coords_v = coords.View();
auto tmpFwd_v = tmpFwd.View();
auto tmpBwd_v = tmpBwd.View();
auto Umu_v = Umu.View();
auto q_out_v = q_out.View();
thread_for(sU, Umu.Grid()->oSites(), {
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vPredicate t_mask;
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
Integer timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
q_out_v[sU],
Umu_v, sU, mu, t_mask);
}
// Repeat for backward direction.
t_mask() = ((coords_v[sU] >= (tmin + tshift)) &&
(coords_v[sU] <= (tmax + tshift)));
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
q_out_v[sU],
Umu_v, sU, mu, t_mask);
}
});
#else
#endif
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -0,0 +1,450 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
//#if defined(A64FXASM)
#if defined(A64FX)
// safety include
#include <arm_sve.h>
// undefine everything related to kernels
#include <simd/Fujitsu_A64FX_undef.h>
///////////////////////////////////////////////////////////
// If we are A64FX specialise the single precision routine
///////////////////////////////////////////////////////////
#if defined(DSLASHINTRIN)
//#pragma message ("A64FX Dslash: intrin")
#include <simd/Fujitsu_A64FX_intrin_single.h>
#else
#pragma message ("A64FX Dslash: asm")
#include <simd/Fujitsu_A64FX_asm_single.h>
#endif
/// Switch off the 5d vectorised code optimisations
#undef DWFVEC5D
/////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, single
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
// undefine
#include <simd/Fujitsu_A64FX_undef.h>
///////////////////////////////////////////////////////////
// If we are A64FX specialise the double precision routine
///////////////////////////////////////////////////////////
#if defined(DSLASHINTRIN)
#include <simd/Fujitsu_A64FX_intrin_double.h>
#else
#include <simd/Fujitsu_A64FX_asm_double.h>
#endif
// former KNL
//#define MAYBEPERM(A,perm) if (perm) { A ; }
//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
/////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, double
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, double
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
// undefs
#include <simd/Fujitsu_A64FX_undef.h>
#endif //A64FXASM

View File

@@ -0,0 +1,395 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: WilsonKernelsAsmBodyA64FX.h
Copyright (C) 2020
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// GCC 10 messes up SVE instruction scheduling using -O3, but
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
// performance now is better than armclang 20.2
#ifdef KERNEL_DAG
#define DIR0_PROJ XP_PROJ
#define DIR1_PROJ YP_PROJ
#define DIR2_PROJ ZP_PROJ
#define DIR3_PROJ TP_PROJ
#define DIR4_PROJ XM_PROJ
#define DIR5_PROJ YM_PROJ
#define DIR6_PROJ ZM_PROJ
#define DIR7_PROJ TM_PROJ
#define DIR0_RECON XP_RECON
#define DIR1_RECON YP_RECON_ACCUM
#define DIR2_RECON ZP_RECON_ACCUM
#define DIR3_RECON TP_RECON_ACCUM
#define DIR4_RECON XM_RECON_ACCUM
#define DIR5_RECON YM_RECON_ACCUM
#define DIR6_RECON ZM_RECON_ACCUM
#define DIR7_RECON TM_RECON_ACCUM
#else
#define DIR0_PROJ XM_PROJ
#define DIR1_PROJ YM_PROJ
#define DIR2_PROJ ZM_PROJ
#define DIR3_PROJ TM_PROJ
#define DIR4_PROJ XP_PROJ
#define DIR5_PROJ YP_PROJ
#define DIR6_PROJ ZP_PROJ
#define DIR7_PROJ TP_PROJ
#define DIR0_RECON XM_RECON
#define DIR1_RECON YM_RECON_ACCUM
#define DIR2_RECON ZM_RECON_ACCUM
#define DIR3_RECON TM_RECON_ACCUM
#define DIR4_RECON XP_RECON_ACCUM
#define DIR5_RECON YP_RECON_ACCUM
#define DIR6_RECON ZP_RECON_ACCUM
#define DIR7_RECON TP_RECON_ACCUM
#endif
//using namespace std;
#undef SHOW
//#define SHOW
#undef WHERE
#ifdef INTERIOR_AND_EXTERIOR
#define WHERE "INT_AND_EXT"
#endif
#ifdef INTERIOR
#define WHERE "INT"
#endif
#ifdef EXTERIOR
#define WHERE "EXT"
#endif
//#pragma message("here")
////////////////////////////////////////////////////////////////////////////////
// Comms then compute kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef INTERIOR_AND_EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
} else { \
LOAD_CHI(base); \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
PREFETCH_CHIMU_L2(basep); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
/*
NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
though I expected that it would improve on performance
*/
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Pre comms kernel -- prefetch like normal because it is mostly right
////////////////////////////////////////////////////////////////////////////////
#ifdef INTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \
PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_1(Dir); \
MULT_2SPIN_2; \
RECON; \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \
PREFETCH_CHIMU_L2(basep); \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Post comms kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
MULT_2SPIN_2; \
RECON; \
nmu++; \
}
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \
{ ZERO_PSI;} \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
MULT_2SPIN_2; \
RECON; \
nmu++; \
}
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
#endif
{
int nmu;
int local,perm, ptype;
uint64_t base;
uint64_t basep;
const uint64_t plocal =(uint64_t) & in[0];
MASK_REGS;
int nmax=U.oSites();
for(int site=0;site<Ns;site++) {
#ifndef EXTERIOR
// int sU =lo.Reorder(ssU);
int sU =ssU;
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
// int sUn=lo.Reorder(ssn);
int sUn=ssn;
#else
int sU =ssU;
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
int sUn=ssn;
#endif
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
ssn=sUn*Ls+s;
int ent=ss*8;// 2*Ndim
int nent=ssn*8;
uint64_t delta_base, delta_base_p;
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
#ifdef SHOW
float rescale = 64. * 12.;
std::cout << "=================================================================" << std::endl;
std::cout << "ss = " << ss << " ssn = " << ssn << std::endl;
std::cout << "sU = " << sU << " ssU = " << ssU << std::endl;
std::cout << " " << std::endl;
std::cout << "Dir = " << Xp << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Xp] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
#ifdef SHOW
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Yp] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
#ifdef SHOW
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Zp] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
#ifdef SHOW
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Tp] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
#ifdef SHOW
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Xm] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
// { uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
#ifdef SHOW
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Ym] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
#ifdef SHOW
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Zm] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
// DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
#ifdef SHOW
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
std::cout << "st.same_node[Dir] = " << st.same_node[Tm] << std::endl;
std::cout << "base = " << (base - plocal)/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
#ifdef EXTERIOR
if (nmu==0) break;
// if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
#endif
base = (uint64_t) &out[ss];
basep= st.GetPFInfo(nent,plocal); ent++;
basep = (uint64_t) &out[ssn];
//PREFETCH_RESULT_L1_STORE(base);
RESULT(base,basep);
#ifdef SHOW
std::cout << "Dir = FINAL " << WHERE<< std::endl;;
base_ss = base;
std::cout << "base = " << (base - (uint64_t) &out[0])/rescale << std::endl;
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
std::cout << "----------------------------------------------------" << std::endl;
#endif
}
ssU++;
UNLOCK_GAUGE(0);
}
}
#undef DIR0_PROJ
#undef DIR1_PROJ
#undef DIR2_PROJ
#undef DIR3_PROJ
#undef DIR4_PROJ
#undef DIR5_PROJ
#undef DIR6_PROJ
#undef DIR7_PROJ
#undef DIR0_RECON
#undef DIR1_RECON
#undef DIR2_RECON
#undef DIR3_RECON
#undef DIR4_RECON
#undef DIR5_RECON
#undef DIR6_RECON
#undef DIR7_RECON
#undef ASM_LEG
#undef ASM_LEG_XP
#undef RESULT

View File

@@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
HAND_RESULT_EXT(ss,F) HAND_RESULT_EXT(ss,F)
#define HAND_SPECIALISE_GPARITY(IMPL) \ #define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
nmu = 0; \ nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
template<> void \ template<> accelerator_inline void \
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \

Some files were not shown because too many files have changed in this diff Show More