mirror of https://github.com/paboyle/Grid.git synced 2025-06-22 17:52:02 +01:00

Compare commits

...

231 Commits

Author SHA1 Message Date
e7d9b75fdd Warning fixes 2022-08-31 19:01:14 -04:00
3d0e3ec363 Tracing 2022-08-31 18:31:46 -04:00
3c1c51f9aa Merge branch 'feature/dirichlet-gparity' into feature/dirichlet 2022-08-31 18:25:34 -04:00
8cc3c522c3 Merge pull request #409 from giltirn/feature/dirichlet-gparity-stage
Import round 5
2022-08-31 18:22:50 -04:00
5c87342108 Used in g-2 sign off 2022-08-31 17:35:32 -04:00
66177bfbe2 Used in g-2 sign off 2022-08-31 17:35:07 -04:00
5205e68963 RocTX, NVTX, text based self profiling 2022-08-31 17:34:09 -04:00
cd5cf6d614 Tracing replaces self timing hooks 2022-08-31 17:33:41 -04:00
5abb19eab0 Remove self timing 2022-08-31 17:32:49 -04:00
06d7b88c78 Force reporting improved 2022-08-31 17:32:21 -04:00
cf72799735 Better action naming 2022-08-31 17:24:11 -04:00
cdb8fcc269 Width=4 support. This is too broad; hit it on physical point run.
Need to change strategy, I think.
2022-08-31 17:21:33 -04:00
b4f4130901 Defer SMP node links until after interior. Allows for DMA overlapping
compute
2022-08-31 17:20:21 -04:00
bb049847d5 Tracing replaces self timing 2022-08-31 17:19:02 -04:00
fd33c835dd Feynman rule fix and tracing replaces self timing 2022-08-31 17:18:17 -04:00
21371a7e5b Tracing replaces self timing 2022-08-31 17:16:05 -04:00
abfaa00d3e Tracing replaces self timing 2022-08-31 17:15:24 -04:00
efee33c55d Tracing replaces self timing 2022-08-31 17:14:57 -04:00
db0fe6ddbb Tracing replaces self timing 2022-08-31 17:14:14 -04:00
8a9e647120 Tracing replaces self timing 2022-08-31 17:13:44 -04:00
e6dcb821ad Tracing replaces self timing 2022-08-31 17:12:31 -04:00
9bff188f02 Tracing replaces self timing 2022-08-31 17:12:05 -04:00
111b30ca1d Tracing replaces self timing 2022-08-31 17:11:48 -04:00
24182ca8bf HIP allows conserved currents.
Tracing replaces self timing
2022-08-31 17:11:18 -04:00
ee2d7369b3 Tracing replaces self timing 2022-08-31 17:10:45 -04:00
7c686d29c9 Tracing replaces self timing 2022-08-31 17:10:17 -04:00
e8a0a1e75d Tracing replaces self timing hooks 2022-08-31 17:09:47 -04:00
730be89abf Remove timing hooks as tracing replaces 2022-08-31 17:08:44 -04:00
f991ad7d5c Remove timing hooks as tracing replaces 2022-08-31 17:08:18 -04:00
b3f33f82f7 Decrease self timing hooks, use nvtx / roctx type tracing hooks instead 2022-08-31 17:06:47 -04:00
a34a6e059f Logging improvement. Sinitial will be used to improve RHMC terms 2022-08-31 17:06:08 -04:00
1333319941 Tracing 2022-08-31 17:00:25 -04:00
9295ed8d20 Print full memory range 2022-08-31 16:59:51 -04:00
19cc7653fb Tracing 2022-08-31 16:57:51 -04:00
5752538661 Tracing 2022-08-31 16:57:32 -04:00
ca40a1b00b Tracing 2022-08-31 16:54:55 -04:00
659fac9dfb Tracing hook 2022-08-31 16:54:25 -04:00
4dc3d6fce0 Buy into Nvidia/Rocm etc... tracing. 2022-08-31 16:53:19 -04:00
95b640cb6b 10TF/s on 32^3 x 64 on single node 2022-08-04 15:43:52 -04:00
2cb5bedc15 Copy stream HIP improvements 2022-08-04 15:24:03 -04:00
806b02bddf Simplify dead code 2022-08-04 15:23:13 -04:00
de40395773 More timing. Think I should start to use nvtx and rocmtx ?? 2022-08-04 13:37:16 -04:00
7ba4788715 Fix 2022-08-04 13:36:44 -04:00
06d9ce1a02 Synch ranks on node here for GPU - GPU memcopy 2022-08-04 13:35:56 -04:00
75bb6b2b40 Move barrier into the StencilSend begin routine 2022-08-04 13:35:26 -04:00
74f10c2dc0 Move barrier into Stencil Send 2022-08-04 13:34:11 -04:00
a93d5459d4 Better mpi request completion 2022-07-28 12:18:35 -04:00
9c21add0c6 High res timer replaces gettimeofday 2022-07-28 12:14:03 -04:00
639aab6563 High res timer instead of gettimeofday 2022-07-28 12:13:35 -04:00
8137cc7049 Always concurrent comms 2022-07-28 12:01:51 -04:00
60e63dca1d Add memory logging channel 2022-07-28 11:39:15 -04:00
486409574e Expanded cache to avoid any allocs in HMC 2022-07-28 11:38:34 -04:00
a913b8be12 Dslash self timing. Might want to not have this 2022-07-28 11:37:55 -04:00
2239751850 Better logging 2022-07-28 11:37:36 -04:00
9b20f1449c Better timing 2022-07-28 11:37:12 -04:00
b99453083d Updated timing 2022-07-28 11:37:02 -04:00
943fbb914d Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-07-11 13:48:42 -04:00
ca4603580d Verbose 2022-07-11 13:48:35 -04:00
f73db8f1f3 Synch clocks 2022-07-11 13:47:39 -04:00
f7217d12d2 World barrier for clock synch 2022-07-11 13:45:31 -04:00
fab50c57d9 More logging 2022-07-11 18:42:27 +01:00
3440534fbf MixedPrec support 2022-07-10 21:35:18 +01:00
177b1a7ec6 Mixed prec 2022-07-10 21:34:10 +01:00
58182fe345 Different approach to default dirichlet params 2022-07-10 21:32:58 +01:00
1f907d330d Different default params for dirichlet 2022-07-10 21:31:48 +01:00
b0fe664e9d Better force log info 2022-07-10 21:31:25 +01:00
c0f8482402 Remove SSC marks 2022-07-07 17:49:36 +01:00
3544965f54 Stream doesn't work 2022-07-07 17:49:20 +01:00
33e4a0caee Imported changes from feature/gparity_HMC branch:
Rework of WilsonFlow class
		Fixed logic error in smear method where the step index was initialized to 1 rather than 0, resulting in the logged output value of tau being too large by epsilon
		Previously smear_adaptive would maintain the current value of tau as a class member variable whereas smear would compute it separately; now both methods maintain the current value internally and it is updated by the evolve_step routines. Both evolve methods are now const.
		smear_adaptive now also maintains the current value of epsilon internally, allowing it to be a const method and also allowing the same class instance to be reused without needing to be reset
		Replaced the fixed evaluation of the plaquette energy density and plaquette topological charge during the smearing with a highly flexible general strategy where the user can add arbitrary measurements as functional objects that are evaluated at an arbitrary frequency
	        By default the same plaquette-based measurements are performed, but additional example functions are provided where the smearing is performed with different choices of measurement that are returned as an array for further processing
		Added a method to compute the energy density using the Cloverleaf approach which has smaller discretization errors
	Added a new tensor utility operation, copyLane, which allows for the copying of a single SIMD lane between two instances of the same tensor type but potentially different precisions
	To LocalCoherenceLanczos, added the option to compute the high/low eval of the fine operator on every restart to aid in tuning the Chebyshev
	Added Test_field_array_io which demonstrates and tests a single-file write of an arbitrary array of fields
	Added Test_evec_compression which generates evecs using Lanczos and attempts to compress them using the local coherence technique
	Added Test_compressed_lanczos_gparity which demonstrates the local coherence Lanczos for G-parity BCs
	Added HMC main programs for the 40ID and 48ID G-parity lattices
2022-07-01 14:12:12 -04:00
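
The measurement-hook rework described in this entry invites a short illustration. The sketch below is only indicative: the addMeasurement name, its callback signature, and the constructor argument order are assumptions based on the wording above, not confirmed API.

#include <Grid/Grid.h>
using namespace Grid;

// Indicative sketch only: register a user-defined observable with the reworked
// WilsonFlow class described above. addMeasurement, its callback signature and
// the constructor argument order are assumptions, not confirmed by this changeset.
void flow_with_custom_measurement(const LatticeGaugeField &U)
{
  const RealD epsilon = 0.01;
  const int Nstep = 100, meas_interval = 10;
  WilsonFlow<PeriodicGimplR> wflow(epsilon, Nstep, meas_interval);   // argument order assumed
  wflow.addMeasurement(meas_interval,                                // hypothetical hook name
    [](int step, RealD tau, const LatticeGaugeField &Uf){
      std::cout << GridLogMessage << "tau=" << tau
                << " plaq=" << WilsonLoops<PeriodicGimplR>::avgPlaquette(Uf) << std::endl;
    });
  LatticeGaugeField Uflow(U.Grid());
  wflow.smear(Uflow, U);   // per the entry above, the step index now starts at 0
}
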
1f903d9296 Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-07-01 12:12:50 -04:00
4df1e0987f Merge branch 'feature/dirichlet-gparity' of https://github.com/paboyle/Grid into feature/dirichlet-gparity 2022-07-01 09:55:43 -04:00
588c2f3cb1 Faster axpy_norm and innerProduct 2022-07-01 09:44:58 -04:00
bd99fd608c Introduce a non-default stream for compute operations 2022-07-01 09:42:53 -04:00
57b442d0de Log memory operations 2022-07-01 09:42:17 -04:00
751a4562d7 Timing improvement 2022-07-01 09:41:43 -04:00
ca66301dee Remove debug 2022-06-30 14:53:12 -04:00
808bb59206 Mixed prec DD-RHMC 2022-06-30 13:50:09 -04:00
4b7f51d19d Create a new RNG file 2022-06-30 13:49:50 -04:00
d03152fac4 New file under debug 2022-06-30 13:49:35 -04:00
137f190258 Dirichlet implementation 2022-06-30 13:45:07 -04:00
53d01312b3 Rough flop counting, need to add M5D, M5Ddag, MooeeInv flops 2022-06-30 13:44:09 -04:00
220050822a Speed up M5D and M5Ddag 2022-06-30 13:43:27 -04:00
87ad76d81b Initialise timeval 2022-06-30 13:42:46 -04:00
4ac1094856 Updated config commands 2022-06-27 12:16:24 -04:00
d44a57b0af Allow frequency=0 to disable 2022-06-27 12:15:55 -04:00
dc000d10ee Spelling correction 2022-06-27 12:14:57 -04:00
3685f391cf More verbose CG 2022-06-27 12:11:08 -04:00
efd7338a00 Allow dirichlet at round the world link 2022-06-27 12:10:27 -04:00
e1e7b1e224 RNG fix 2022-06-27 12:09:52 -04:00
7319d4e1ad Merge pull request #407 from giltirn/feature/dirichlet-gparity-stage
Import round 4
2022-06-22 15:23:36 -04:00
fd933420c6 Imported changes from feature/gparity_HMC branch:
Added a bounds-check function for the RHMC with arbitrary power
	Added a pseudofermion action for the rational ratio with an arbitrary power and a mixed-precision variant of the same. The existing one-flavor rational ratio class now uses the general class under the hood
	To support testing of the two-flavor even-odd ratio pseudofermion, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field
	Added a new HMC runner start type: CheckpointStartReseed, which reseeds the RNG from scratch, allowing for the creation of new evolution streams from an existing checkpoint. Added log output of seeds used when the RNG is seeded.
	EOFA changes:
		To support mixed-precision inversion, generalized the class to maintain a separate solver for the L and R operators in the heatbath (separate solvers are already implemented for the other stages)
		To support mixed-precision, the action of setting the operator shift coefficients is now maintained in a virtual function. A derived class for mixed-precision solvers ensures the coefficients are applied to both the double and single-prec operators
		The ||^2 of the random source is now stored by the heatbath and compared to the initial action when it is computed. These should be equal but may differ if the rational bounds are not chosen correctly, hence serving as a useful and free test
		Fixed calculation of M_eofa (previously incomplete and #if'd out)
		Added functionality to compute M_eofa^-1 to complement the calculation of M_eofa (both are equally expensive!)
		To support testing, separated the functionality of generating the random field and performing the heatbath step, and added a method to obtain the pseudofermion field
	Added a test program which computes the G-parity force using the 1 and 2 flavor implementations and compares the result. Test supports DWF, EOFA and DSDR actions, chosen by a command line option.
	The Mobius EOFA force test now also checks the rational approximation used for the heatbath
	Added a test program for the mixed precision EOFA compared to the double-prec implementation,
	G-parity HMC test now applied GPBC in the y direction and not the t direction (GPBC in t are no longer supported) and checkpoints after every configuration
	Added a test program which computes the two-flavor G-parity action (via RHMC) with both the 1 and 2 flavor implementations and checks they agree
	Added a test program to check the implementation of M_eofa^{-1}
2022-06-22 10:27:48 -04:00
8208a6214f Merge branch 'feature/dirichlet-gparity' into feature/dirichlet 2022-06-15 19:23:48 -04:00
3d8146b596 Merge branch 'feature/dirichlet-gparity' of https://github.com/paboyle/Grid into feature/dirichlet-gparity 2022-06-15 19:20:27 -04:00
31efa5c4da Script updates for current summit 2022-06-15 19:19:44 -04:00
d10d30dda8 Script update 2022-06-15 19:18:58 -04:00
0e9666bc92 Test update 2022-06-15 19:18:42 -04:00
6efd80f104 Printing 2022-06-15 18:23:46 -04:00
fdef7a1a8c Dirichlet fix 2022-06-15 00:05:20 -04:00
501bb117bf Const correct 2022-06-15 00:04:09 -04:00
05ca7dc252 Const correctness 2022-06-14 23:41:05 -04:00
e9648a1635 Useful periodic print. CG convergence bound is remarkably accurate on
low eigenvalue in numerical tests
2022-06-14 23:40:04 -04:00
9a9f4a111f Merge pull request #405 from giltirn/feature/dirichlet-gparity-stage
Import round 3
2022-06-06 18:45:37 -04:00
1ad54d049d To PeriodicBC and ConjugateBC, added a new function "CshiftLink" which performs a boundary-aware C-shift of links or products of links. For the latter, the links crossing the global boundary are complex-conjugated.
To the gauge implementations, added CshiftLink functions calling into the appropriate operation for the BC in a given direction.
GaugeTransform, FourierAcceleratedGaugeFixer and WilsonLoops::FieldStrength no longer implicitly assume periodic boundary conditions; instead the shifted link is obtained using CshiftLink and is aware of the gauge implementation.
Added an assert-check to ensure that the gauge fixing converges within the specified number of steps.
Added functionality to compute the timeslice averaged plaquette
Added functionality to compute the 5LI topological charge and timeslice topological charge
Added a check of the properties of the charge conjugation matrix C=-gamma_2 gamma_4 to Test_gamma
Fixed const correctness for Replicate
Modified Test_fft_gfix to support either conjugate or periodic BCs, optionally disabling Fourier-accelerated gauge fixing, and tuning of alpha using cmdline options
2022-06-02 15:30:41 -04:00
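
As a rough illustration of the boundary-aware shift this entry describes, a link product built through the gauge implementation could look like the sketch below; the exact CshiftLink calling convention is an assumption taken from the wording above.

#include <Grid/Grid.h>
using namespace Grid;

// Sketch: form U_mu(x) U_nu(x+mu) using a shift that respects the boundary condition
// in direction mu. The static Gimpl::CshiftLink(link,mu,shift) form is assumed from
// the entry above; for ConjugateBC the link crossing the global boundary is
// complex-conjugated, for PeriodicBC this reduces to an ordinary Cshift.
template<class Gimpl>
typename Gimpl::GaugeLinkField shifted_link_product(const typename Gimpl::GaugeField &U,
                                                    int mu, int nu)
{
  typedef typename Gimpl::GaugeLinkField GaugeMat;
  GaugeMat Umu = PeekIndex<LorentzIndex>(U, mu);
  GaugeMat Unu = PeekIndex<LorentzIndex>(U, nu);
  GaugeMat Unu_xpmu = Gimpl::CshiftLink(Unu, mu, 1);   // assumed signature
  GaugeMat result(U.Grid());
  result = Umu * Unu_xpmu;
  return result;
}
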
57bd0a0a22 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-06-01 19:29:38 -04:00
b49db84b08 Slurm updates 2022-06-01 19:27:42 -04:00
583f7c52f3 SSC mark 2022-06-01 19:27:29 -04:00
58a86c9164 SSC mark removal 2022-06-01 19:27:06 -04:00
a25b32847f Crusher patch 2022-06-01 19:26:37 -04:00
6f1a2e132b SSC mark causing problems 2022-06-01 19:26:06 -04:00
b1ede7b46d Faster RNG init 2022-06-01 19:25:42 -04:00
e762c940c2 Reduce the loop over exterior for GPU to indirection table 2022-06-01 14:29:25 -07:00
6a1a198144 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-29 11:08:09 -04:00
34faa39f4f Clean up Dirichlet. Big oops fix 2022-05-28 17:18:08 -07:00
5ddea3829d Extra easier signature for peek 2022-05-28 15:52:39 -07:00
7eb29cf529 MPI fix 2022-05-28 15:51:34 -07:00
f729b9b889 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-25 14:16:09 -04:00
4f997c5f04 Remove extra face kernels in Dirichlet 2022-05-25 11:15:25 -07:00
d3496d2fe0 Merge pull request #397 from giltirn/feature/dirichlet-gparity-stage
Gparity HMC import round 2
2022-05-25 13:29:45 -04:00
60f4cb0ffd Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-05-25 12:38:10 -04:00
136d843ce7 Crusher updates 2022-05-25 12:36:09 -04:00
18028f4309 Merge branch 'develop' into feature/dirichlet 2022-05-24 18:26:18 -07:00
5164016740 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-05-24 18:25:57 -07:00
d83beaa890 Update perlmutter 2022-05-24 18:25:00 -07:00
f9f05e995b Update perlmutter 2022-05-24 18:24:38 -07:00
e651b9e7ab Clean up stencil with better intranode Dirichlet / DDHMC support.
14TF/s on a Perlmutter node
2022-05-24 18:23:39 -07:00
47b4e91473 Verbose change 2022-05-24 18:19:18 -07:00
3f31afa4fc Clean up verbose 2022-05-24 18:18:51 -07:00
af3b065add Merge pull request #403 from fjosw/fix/cuda_11_5_warnings
Fixed nvcc 11.5+ warnings
2022-05-24 11:10:02 -04:00
7937ac2bab fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in pugixml/pugixml.cc 2022-05-24 15:31:03 +01:00
e909aeedf0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT standard in Grid_Eigen_Dense.h 2022-05-24 15:29:42 +01:00
bab8aa8eb0 fix: conditional pragmas according to new NVCC_DIAG_PRAGMA_SUPPORT
standard in DisableWarnings.h
2022-05-24 15:27:40 +01:00
38b22f05be Merge pull request #402 from fjosw/fix/clover_warnings
fixed clover warnings
2022-05-24 10:05:27 -04:00
617c5362c1 fix: fixed warning: missing return statement at end of non-void function
in CloverHelpers
2022-05-24 11:37:33 +01:00
083b58e66d Merge pull request #401 from JPRichings/LocalCoheranceDeflation
Local coherence batch deflation
2022-05-20 11:44:22 -04:00
633427a2df Merge pull request #400 from JPRichings/wilson_sweep
bench wilson sweep fix
2022-05-20 11:43:40 -04:00
2031d6910a Merge branch 'paboyle:develop' into wilson_sweep 2022-05-20 16:20:23 +01:00
f82ce67624 Dirichlet improved 2022-05-19 19:17:11 -07:00
b52e8ef65a Dirichlet changes 2022-05-19 16:45:41 -07:00
2594e3c230 Dirichlet option 2022-05-19 16:45:19 -07:00
8cedb45af2 Dirichlet BCs 2022-05-19 16:45:02 -07:00
aa008cbe99 Updated for new Dirichlet interface 2022-05-19 16:44:39 -07:00
79e34b3eb4 Local Coherence batch deflation 2022-05-19 14:53:17 +01:00
4f3d581ab4 Merge branch 'paboyle:develop' into LocalCoheranceDeflation 2022-05-19 14:46:17 +01:00
6fb6ca5b6b Merge branch 'develop' into feature/dirichlet 2022-05-17 09:09:00 -07:00
b8ee19691c Updated config for PM 2022-05-17 09:08:12 -07:00
d16427b837 Merge pull request #399 from fjosw/fix/Nc_neq_3
fix: assert for dimensions of compact Wilson clover moved to constructor
2022-05-17 09:03:42 -04:00
4b1997e2f3 wilson sweep test 2022-05-16 15:58:33 +01:00
8939d5dc73 bugfix: eo operator called in correct location 2022-05-16 00:28:28 +01:00
b051e00de0 Additional Local Coherence Deflation operator() 2022-05-16 00:25:13 +01:00
8aa75b492f Merge branch 'develop' into fix/Nc_neq_3 2022-05-10 14:22:03 +01:00
0274f40686 Merge pull request #389 from mbruno46/mbruno-eclover
Feature/expClover
2022-05-10 09:18:19 -04:00
77aa147ce5 Merge branch 'develop' into mbruno-eclover 2022-05-10 09:16:53 -04:00
32facbd02a fix: assert for dimensions of compact Wilson clover moved to
constructor.
2022-05-10 10:53:22 +01:00
6121397587 Imported changes from feature/gparity_HMC branch:
Added storage of final true residual in mixed-prec CG and enhanced log output
	Fixed const correctness of multi-shift constructor
	Added a mixed precision variant of the multi-shift algorithm that uses a single precision operator and applies periodic reliable update to the residual
	Added tests/solver/Test_dwf_multishift_mixedprec to test the above
	Fixed local coherence lanczos using the (large!) max approx to the chebyshev eval as the scale from which to judge the quality of convergence, resulting in a test that always passes
	Added a method to local coherence lanczos class that returns the fine eval/evec pair
	Added iterative log output to power method
	Added optional disabling of the plaquette check in Nerscio to support loading old G-parity configs which have a factor of 2 error in the plaquette
	G-parity Dirac op no longer allows GPBC in the time direction; instead we toggle between periodic and antiperiodic
	Replaced thread_for G-parity 5D force insertion implementation with accelerator_for version capable of running on GPUs
	Generalized tests/lanczos/Test_dwf_lanczos to support regular DWF as well as Gparity, with the action chosen by a command line option
	Modified tests/forces/Test_dwf_gpforce,Test_gpdwf_force,Test_gpwilson_force to use GPBC in a spatial direction rather than the t-direction, and antiperiodic BCs for the time direction
	tests/core/Test_gparity now supports using APBC in time direction using command line toggle
2022-05-09 16:27:57 -04:00
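
For orientation on the multi-shift entries above: a single multi-shift CG builds one Krylov space and solves every shifted system

    (M^\dagger M + \sigma_s)\, x_s = b , \qquad s = 0, \dots, N_\mathrm{shift}-1 ,

at essentially the cost of the unshifted solve. In the mixed-precision variant described here (the implementation appears later in this changeset), the matrix multiplies and the residual recurrence run in single precision while the search directions and solutions are kept in double precision; every ReliableUpdateFreq iterations the residual is replaced by the true double-precision residual of the primary (smallest) shift,

    r \;\leftarrow\; b - (M^\dagger M + \sigma_0)\, x_0 ,

which bounds the accumulation of single-precision rounding error.
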
4de50ab146 Merge pull request #396 from fjosw/fix/readd_config.h
fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am
2022-05-09 08:26:48 -04:00
8b12a61097 fix: readded Config.h and Version.h to HFILEs in Grid/Makefile.am 2022-05-09 11:53:22 +01:00
79ea027c0b Merge pull request #377 from RJHudspith/develop
NERSC and ILDG for non-SU(3) configuration checkpoints
2022-05-03 08:55:48 -04:00
62339d437f Merge pull request #387 from lehner/feature/gpt
Parity mass terms for domain wall fermions to enable 4d eofa
2022-05-03 08:52:18 -04:00
698e745276 Merge pull request #390 from fjosw/feature/conserved_current_wilson
Conserved current for wilson fermions
2022-05-03 08:51:10 -04:00
0417b96896 Merge pull request #391 from giltirn/feature/dirichlet-gparity-stage
First stage of import
2022-05-03 08:50:18 -04:00
9a6e2c315d Merge pull request #394 from fjosw/fix/gauge_fix_ErrorOnNoConverge
SteepestDescentGaugeFix now exits when the algorithm does not converge.
2022-05-03 08:49:26 -04:00
e61fed87db SteepestDescentGaugeFix now exits when the algorithm does not converge.
This behaviour can be altered by setting err_on_no_converge to false.
2022-04-20 15:41:55 +01:00
81fe4c937e Hopefully fix link errors on Intel compilers due to having no function body for MomentumFilterBase::apply_phase 2022-04-12 09:51:59 -04:00
f77f3a6598 Imported G-parity flavor algebra + tester from feature/gparity_HMC branch 2022-04-06 10:21:04 -04:00
239afb18fb Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-04-05 16:49:32 -04:00
ef820a26cd Bcopy on crusher compile 2022-04-05 16:49:02 -04:00
65abe4d0d3 Merge branch 'feature/dirichlet' into feature/dirichlet-gparity 2022-04-05 16:26:54 -04:00
5012adfebf Merge branch 'develop' into feature/dirichlet 2022-04-05 16:26:19 -04:00
b808d48fa1 Tone down printing in integrator 2022-04-05 16:25:22 -04:00
83f818a99d Updates for DDHMC 2022-04-05 16:24:34 -04:00
b8bc560b51 Test_wilson_conserved_current implemented, all 5d references removed. 2022-04-05 17:33:45 +01:00
6bc2483d57 Merge branch 'feature/eclover' into feature/conserved_current_wilson 2022-04-05 15:26:49 +01:00
82aecbf4cf Test_wilson_conserved_current added 2022-04-05 15:26:39 +01:00
ee23a76aa0 Merge pull request #2 from fjosw/feature/eclover
Feature/eclover
2022-04-05 13:30:13 +02:00
d7191e5a02 SeqConservedCurrent implemented for Wilson fermions 2022-04-05 11:48:56 +01:00
c8a824425b Error message added if another conserved current than vector is requested for
Wilson type fermions.
2022-04-05 10:58:22 +01:00
f23626a6b8 End scope by additional block in CloverHelpers.h 2022-04-02 16:08:15 +01:00
6577a03d16 Explicitly closed views in Exponentiate_Clover 2022-04-01 18:39:12 +01:00
427c8695fe Change signs and prefactors for conserved current to mimic the 5d
version.
2022-04-01 16:20:21 +01:00
9e82c468ab Multiplication of diagonal mass in exponentiate fixed for gpus 2022-04-01 15:54:43 +01:00
603fd96747 Missing link multiplication added. 2022-04-01 10:58:56 +01:00
fe993c0836 /=2 replaced by *=0.5 2022-03-31 17:08:17 +01:00
cdf31d52c1 GaugeGrid and typo fixed 2022-03-31 17:04:35 +01:00
0542eaf1da First version of conserved current contraction for Wilson type quarks 2022-03-31 17:02:09 +01:00
317bdcf158 nerscio parametrization 2022-03-24 13:10:47 +01:00
387397374a Current run options 2022-03-23 16:35:11 -04:00
9ca2c98882 Merge branch 'develop' of https://github.com/paboyle/Grid into mbruno-eclover 2022-03-22 15:31:37 +01:00
605cf401e1 Merge branch 'feature/sumd-npr' into develop 2022-03-16 22:43:12 +00:00
f99c3660d2 Merge branch 'feature/cpu-threaded-smp' into develop 2022-03-16 22:07:54 +00:00
92a83a9eb3 Performance improve for Tesseract 2022-03-16 17:14:36 +00:00
53ae01a34a Merge pull request #1 from fjosw/feature/eclover
Feature/eclover
2022-03-15 15:23:35 +01:00
b615fa0f35 Merge pull request #388 from fjosw/feature/sumd-npr
Feature/sumd npr
2022-03-15 09:05:57 -04:00
76c294a7ba open bc fix 2022-03-08 13:55:16 +01:00
0c0c2b1e20 Unnecessary arguments of CloverHelpers::Exponentiate_Clover removed. 2022-03-08 09:44:51 +00:00
e2fc3a0f04 Merge pull request #28 from paboyle/develop
Sync with Upstream
2022-03-08 09:58:51 +01:00
451e7972fd Reintroduced explicit inversion of the Clover term in case of the
CompactExpClover because of the open boundary O(a) improvement. Changed
the timing output to GridLogDebug
2022-03-07 17:43:33 +00:00
56c089d347 Removed leftover comments 2022-03-07 16:40:20 +00:00
acf740e44d Merge pull request #1 from FelixPGZiegler/feature/eclover
Feature/eclover
2022-03-07 16:25:11 +00:00
182f513404 Merge remote-tracking branch 'fjosw/feature/eclover' into feature/eclover 2022-03-07 15:22:04 +00:00
d5b2323a57 included Cayley-Hamilton exponentiation for the compact Wilson exp clover, bug fix for inverse of exp clover 2022-03-07 14:44:24 +00:00
bad18d4417 Merge branch 'paboyle:develop' into feature/eclover 2022-03-07 13:54:10 +00:00
bb5c16b97f New scripts 2022-03-03 17:00:37 -05:00
0d80eeb545 small DDHMC update 2022-03-03 16:56:02 -05:00
d1decee4cc Cleaned up unused variables in Lattice_reduction_gpu.h 2022-03-02 16:54:23 +00:00
d4ae71b880 sum_gpu_large and sum_gpu templates added. 2022-03-02 15:40:18 +00:00
b0f4eee78b New files 2022-03-01 19:09:13 -05:00
5340e50427 HMC running with new formulation 2022-03-01 17:10:25 -05:00
e16fc5b2e4 Threaded intranode comms transfer - ideally between NUMA domains 2022-03-01 11:17:24 -05:00
9aac1e6d64 Merge branch 'develop' into feature/sumd-npr 2022-03-01 10:51:38 -05:00
3e882f555d Large / small sumD options 2022-03-01 08:54:45 -05:00
438caab25f generate_instantiations.sh now correctly produces instantiations for CompactClover variant, redundant instantiations removed. 2022-02-27 18:27:18 +00:00
239e2c1ee6 tests: wilson clover cg tests now include compact variant as well as
exponential wilson clover operators
2022-02-27 18:26:34 +00:00
013dc2ef33 tests: core tests for wilson clover and wilson exp clover including
compact version extended/added
2022-02-27 18:13:47 +00:00
9616811c3d Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2022-02-24 22:03:05 +01:00
8a3002c03b separate left and right masses for CayleyFermion5D 2022-02-24 22:02:56 +01:00
0f1c5b08a1 Dirichlet filters running on AMD and now integrated in Fermion op 2022-02-23 19:29:28 -05:00
70988e43d2 Passes multinode dirichlet test with boundaries at
node boundary or at the single rank boundary
2022-02-23 01:42:14 -05:00
71034f828e attempt to fix broken WilsonExpClover; Compact version still broken will be replaced by F.Joswig 2022-02-23 01:02:27 +01:00
aab3bcb46f Dirichlet first cut - wrong answers on dagger multiply.
Struggling to get a compute node so changing systems
2022-02-22 19:58:33 +00:00
11437930c5 cleaned up definitions of wilsonclover fermions 2022-02-22 10:45:16 +01:00
3d44aa9cb9 cleaned up cloverhelpers; fixed test compact_clover which runs 2022-02-22 01:10:19 +01:00
2851870d70 expClover support via helpers template class 2022-02-22 00:05:43 +01:00
da06d15f73 Merge branch 'feature/feature/staggered-comms' into develop 2022-02-17 04:58:50 +00:00
e8b1251b8c Staggered fix finished 2022-02-17 04:51:13 +00:00
fad5a74a4b Bug fix to detection case 2022-02-15 10:27:39 -05:00
e83f6a6ae9 Merge branch 'develop' into feature/feature/staggered-comms 2022-02-15 08:52:39 -05:00
6283d11d50 Add the comment line to tell the existence of copied data/buffer 2022-02-08 15:22:06 +00:00
6616d5d090 Commit 2022-02-02 16:38:24 -05:00
0bd83cdbda Fixes for Nc!=3 Nersc IO, Gauge and Gauge_NCxNC compatible with GLU. Trace normalisation changed in places removing explicit threes. Guards against non-su3 tests and tests failing when LIME is not compiled. 2021-11-28 21:51:03 +01:00
42d56ea6b6 Verbosity 2021-10-29 02:23:08 +01:00
0b905a72dd Better reduction for GPUs 2021-10-29 02:22:22 +01:00
196 changed files with 14785 additions and 3125 deletions

View File

@@ -44,14 +44,22 @@ directory
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC

View File

@@ -44,7 +44,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Tracing.h>
//#include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h>
#include <Grid/allocator/Allocator.h>

View File

@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);

View File

@@ -16,6 +16,7 @@
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>

View File

@@ -14,7 +14,11 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
#endif
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")

View File

@@ -54,6 +54,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>

View File

@@ -58,6 +58,7 @@ public:
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
GRID_TRACE("ConjugateGradient");
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
@@ -117,9 +118,13 @@ public:
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
RealD usecs = -usecond();
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
GridStopWatch IterationTimer;
IterationTimer.Start();
c = cp;
MatrixTimer.Start();
@@ -152,31 +157,41 @@ public:
LinearCombTimer.Stop();
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
IterationTimer.Stop();
if ( (k % 500) == 0 ) {
std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
} else {
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
}
// Stopping condition
if (cp <= rsq) {
usecs +=usecond();
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
GridBase *grid = src.Grid();
RealD DwfFlops = (1452. )*grid->gSites()*4*k
+ (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl;
std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
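
For reference, the Mobius flop estimate introduced in the hunk above evaluates to

    \mathrm{DwfFlops} \;=\; k \cdot \mathrm{gSites} \cdot \bigl( 1452 \times 4 \;+\; (8+4+8+4+4) \times 12 \bigr) \;=\; 6144 \, k \cdot \mathrm{gSites} ,

with gSites() the global site count of the source field's grid and k the number of CG iterations; the new log line divides this by the elapsed time in microseconds to report the flop rate.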

View File

@@ -49,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
RealD TrueResidual;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
@@ -68,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
@@ -97,6 +99,7 @@ NAMESPACE_BEGIN(Grid);
FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
@@ -130,6 +133,7 @@ NAMESPACE_BEGIN(Grid);
(*guesser)(src_f, sol_f);
//Inner CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
@@ -150,6 +154,7 @@ NAMESPACE_BEGIN(Grid);
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TrueResidual = CG_d.TrueResidual;
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;

View File

@@ -44,7 +44,7 @@ public:
using OperatorFunction<Field>::operator();
RealD Tolerance;
// RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@@ -52,7 +52,7 @@ public:
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
@@ -84,6 +84,7 @@ public:
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
GRID_TRACE("ConjugateGradientMultiShift");
GridBase *grid = src.Grid();
@@ -182,6 +183,9 @@ public:
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
///////////////////////////////////////
// Timers
@@ -321,8 +325,8 @@ public:
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k;

View File

@@ -0,0 +1,410 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//Linop to add shift to input linop, used in cleanup CG
namespace ConjugateGradientMultiShiftMixedPrecSupport{
template<typename Field>
class ShiftedLinop: public LinearOperatorBase<Field>{
public:
LinearOperatorBase<Field> &linop_base;
RealD shift;
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out);
axpy(out, shift, in, out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
};
};
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq
) :
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF r_f(SinglePrecGrid);
FieldF p_f(SinglePrecGrid);
FieldF tmp_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
FieldF src_f(SinglePrecGrid);
precisionChange(src_f, src_d);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
r_f=src_f; //residual maintained in single
p_f=src_f;
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
//MdagM+m[0]
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
axpy(mmp_f,mass[0],p_f,mmp_f);
RealD rn = norm2(p_f);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_f,b,mmp_f,r_f);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
a = c /cp;
//Update double precision search direction by residual
PrecChangeTimer.Start();
precisionChange(r_d, r_f);
PrecChangeTimer.Stop();
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_d[s],a,ps_d[s],r_d);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
}
}
}
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
d=real(innerProduct(p_f,mmp_f));
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_f,mass[0],p_f,mmp_f);
AXPYTimer.Stop();
RealD rn = norm2(p_f);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update double precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
}
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
AXPYTimer.Stop();
c = c_f;
if(k % ReliableUpdateFreq == 0){
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
PrecChangeTimer.Start();
precisionChange(r_f, r_d);
PrecChangeTimer.Stop();
c = c_d;
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
};
NAMESPACE_END(Grid);
#endif
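
A minimal usage sketch for the class defined above, following the constructor and operator() signatures in this file; the shift set and the operators are schematic and assumed to come from the caller (e.g. from a rational approximation).

#include <Grid/Grid.h>
using namespace Grid;

// Sketch: drive the mixed-precision multi-shift CG defined above.
// Linop_d / Linop_f are caller-provided double- and single-precision MdagM-type
// operators, SinglePrecGrid the single-precision grid, and 'shifts' e.g. a
// rational-approximation pole set. psi must hold shifts.order fields.
void multishift_solve(LinearOperatorBase<LatticeFermionD> &Linop_d,
                      LinearOperatorBase<LatticeFermionF> &Linop_f,
                      GridBase *SinglePrecGrid,
                      const MultiShiftFunction &shifts,
                      const LatticeFermionD &src,
                      std::vector<LatticeFermionD> &psi)
{
  const Integer maxIters = 20000;
  const int reliableUpdateFreq = 50;   // interval for the double-precision residual correction
  ConjugateGradientMultiShiftMixedPrec<LatticeFermionD,LatticeFermionF>
    mscg(maxIters, shifts, SinglePrecGrid, Linop_f, reliableUpdateFreq);
  mscg(Linop_d, src, psi);
}
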

View File

@@ -73,6 +73,7 @@ public:
}
void operator()(const FieldD &src, FieldD &psi) {
GRID_TRACE("ConjugateGradientReliableUpdate");
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;

View File

@@ -113,7 +113,43 @@ public:
blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard();
};
};
void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
int Nevec = (int)evec_coarse.size();
int Nsrc = (int)src.size();
// make temp variables
std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
//Preporcessing
std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
guess_coarse[j] = Zero();
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockProject(src_coarse[j],src[j],subspace);
}
//deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
for (int i=0;i<Nevec;i++)
{
std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
const CoarseField & tmp = evec_coarse[i];
for (int j=0;j<Nsrc;j++)
{
axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
}
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
}
};
};

View File

@@ -44,6 +44,7 @@ public:
int, MinRes); // Must restart
};
//This class is the input parameter class for some testing programs
struct LocalCoherenceLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -145,16 +146,24 @@ public:
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
//As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
//To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
//out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
//NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
RealD coarse_relax_tol=5.0e3,
int largestEvalIdxForReport=-1)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
_coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
{ };
//evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
CoarseField v(B);
@@ -177,12 +186,26 @@ public:
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
RealD tmp_eval;
ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
}
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
//This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
//applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
//evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
//As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
//We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
evalMaxApprox = 1.0; //cf above
GridBase *FineGrid = _subspace[0].Grid();
int checkerboard = _subspace[0].Checkerboard();
FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -201,13 +224,13 @@ public:
eval = vnum/vden;
fv -= eval*fB;
RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
<<std::endl;
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
if( (vv<eresid*eresid) ) return 1;
return 0;
}
@@ -285,6 +308,10 @@ public:
evals_coarse.resize(0);
};
//The block inner product is the inner product on the fine grid locally summed over the blocks
//to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
//vectors under the block inner product. This step must be performed after computing the fine grid
//eigenvectors and before computing the coarse grid eigenvectors.
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -328,6 +355,8 @@ public:
}
}
//While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
//hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
@@ -376,25 +405,31 @@ public:
evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid);
}
//cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
//cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
//relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
Chebyshev<FineField> Cheby(cheby_op); //Chebyshev of fine operator on fine grid
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
Chebyshev<FineField> ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);
CoarseField src(_CoarseGrid); src=1.0;
//Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@ -405,6 +440,14 @@ public:
std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl;
}
}
//Get the fine eigenvector 'i' by reconstruction
void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
blockPromote(evec_coarse[i],evec,subspace);
eval = evals_coarse[i];
}
};
NAMESPACE_END(Grid);
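A minimal usage sketch of the interface above, assuming a local-coherence Lanczos object "LCL" of this class has already been constructed with its fine-grid subspace computed; the grid pointer "FineGrid" and all parameter values are illustrative, not part of the patch.

ChebyParams cheby_op, cheby_smooth;        // fine-operator and smoothing Chebyshev parameters (fill in)
RealD relax = 50.0;                        // relaxation of the convergence test (see comments above)
int   Nstop=60, Nk=80, Nm=120;             // illustrative coarse IRL parameters
RealD resid=1.0e-8, MaxIt=50, betastp=0.0;
int   MinRes=0;

LCL.Orthogonalise();                       // block Gram-Schmidt of the fine subspace
LCL.calcCoarse(cheby_op,cheby_smooth,relax,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);

FineField evec(FineGrid);                  // FineGrid: the fine GridBase* used to build LCL
RealD     eval;
LCL.getFineEvecEval(evec,eval,0);          // reconstruct fine eigenvector 0 via blockPromote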

View File

@ -29,6 +29,8 @@ template<class Field> class PowerMethod
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na;

View File

@ -40,7 +40,7 @@ void MemoryManager::PrintBytes(void)
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils

View File

@ -3,8 +3,14 @@
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
//#define dprintf(...) printf (__VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
////////////////////////////////////////////////////////////
@ -104,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -112,7 +118,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@ -126,7 +132,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
mprintf("MemoryManager: Evict(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
@ -137,7 +143,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@ -150,7 +156,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@ -165,7 +171,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
@ -241,7 +247,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
dprintf("ViewOpen found entry %lx %lx : %ld %ld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,

View File

@ -53,10 +53,11 @@ public:
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
Coordinate _processor_coor; // linear processor coordinate
unsigned long _ndimension;
Coordinate _shm_processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processor_coor; // linear processor coordinate
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
@ -97,14 +98,16 @@ public:
int BossRank(void) ;
int ThisRank(void) ;
const Coordinate & ThisProcessorCoor(void) ;
const Coordinate & ShmGrid(void) { return _shm_processors; } ;
const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ;
int ProcessorCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
static void BarrierWorld(void);
////////////////////////////////////////////////////////////
// Reduction
@ -142,16 +145,16 @@ public:
int bytes);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,
int recv_from_rank,int do_recv,
int bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,
int recv_from_rank,int do_recv,
int bytes,int dir);
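A hedged sketch of how the widened stencil interface might be driven; "comm" stands for an existing CartesianCommunicator, and the buffers, ranks, byte count and direction are assumed to have been prepared by the stencil code.

std::vector<CommsRequest_t> reqs;
int do_xmit = 1;                  // issue the send in this direction
int do_recv = 0;                  // skip the receive, e.g. when this face needs no communication
comm.StencilSendToRecvFromBegin(reqs, xmit_buf, dest_rank, do_xmit,
                                      recv_buf, from_rank, do_recv,
                                      nbytes, dir);
comm.StencilSendToRecvFromComplete(reqs, dir);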

View File

@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
@ -124,12 +124,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
shm_processors [pad+d]=parent._shm_processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -154,6 +155,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@ -335,22 +337,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest,
int dest, int dox,
void *recv,
int from,
int from, int dor,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
int dest,int dox,
void *recv,
int from,
int from,int dor,
int bytes,int dir)
{
int ncomm =communicator_halo.size();
@ -370,38 +372,41 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
double off_node_bytes=0.0;
int tag;
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
}
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
// TODO : make a OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
}
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
/* if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
* this->StencilSendToRecvFromComplete(list,dir);
* list.resize(0);
* }
*/
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
StencilBarrier();// Synch shared memory on a single nodes
int nreq=list.size();
@ -438,6 +443,10 @@ int CartesianCommunicator::RankWorld(void){
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BarrierWorld(void){
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,

View File

@ -45,12 +45,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{
_shm_processors = Coordinate(processors.size(),1);
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
_shm_processors = Coordinate(processors.size(),1);
_processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1);
_processor_coor.resize(_ndimension);
@ -102,6 +104,7 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
@ -111,18 +114,18 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,
int recv_from_rank,int dor,
int bytes, int dir)
{
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,
int recv_from_rank,int dor,
int bytes, int dir)
{
return 2.0*bytes;

View File

@ -93,9 +93,10 @@ public:
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
// Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world

View File

@ -152,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
//////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions
@ -165,8 +165,8 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
}
static inline int divides(int a,int b)
{
@ -221,7 +221,7 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
@ -294,7 +294,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
Coordinate HyperCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM = ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
@ -341,7 +342,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
}
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
@ -353,6 +354,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
SHM=ShmDims;
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
@ -520,7 +523,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;

View File

@ -48,9 +48,10 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
optimal_comm = WorldComm;
SHM = Coordinate(processors.size(),1);
}
////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -46,3 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>

View File

@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = lhs.Checkerboard();
autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead);
@ -53,6 +54,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@ -70,6 +72,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@ -86,6 +89,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("add");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
@ -106,6 +110,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
autoView( ret_v , ret, AcceleratorWrite);
@ -119,6 +124,7 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
autoView( ret_v , ret, AcceleratorWrite);
@ -133,6 +139,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
autoView( ret_v , ret, AcceleratorWrite);
@ -146,6 +153,7 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
GRID_TRACE("add");
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
autoView( ret_v , ret, AcceleratorWrite);
@ -163,6 +171,7 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mult");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@ -177,6 +186,7 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("mac");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@ -191,6 +201,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("sub");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@ -204,6 +215,7 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
GRID_TRACE("add");
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
autoView( ret_v , ret, AcceleratorWrite);
@ -218,6 +230,7 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
GRID_TRACE("axpy");
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
@ -231,6 +244,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
}
template<class sobj,class vobj> inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
GRID_TRACE("axpby");
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
@ -246,11 +260,13 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpy_norm");
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpby_norm");
return axpby_norm_fast(ret,a,b,x,y);
}

View File

@ -117,6 +117,7 @@ public:
////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@ -140,6 +141,7 @@ public:
}
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
@ -163,6 +165,7 @@ public:
}
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);

View File

@ -0,0 +1,55 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_crc.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
{
auto ff = localNorm2(f);
if ( mu==-1 ) mu = f.Grid()->Nd()-1;
typedef typename vobj::tensor_reduced normtype;
typedef typename normtype::scalar_object scalar;
std::vector<scalar> sff;
sliceSum(ff,sff,mu);
for(int t=0;t<sff.size();t++){
std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
}
}
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);
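A short sketch of the new fingerprinting helpers, assuming an existing field U and grid; crc() relies on zlib's ::crc32, so the field name and grid here are illustrative only.

LatticeGaugeFieldD U(grid);
// ... U filled elsewhere ...
uint32_t fp = crc(U);                     // CRC32 of the local lattice data
std::cout << "crc32(U) = " << std::hex << fp << std::dec << std::endl;
CRC(U);                                   // macro form: prints file, line, variable name and checksum
DumpSliceNorm("U",U);                     // per-slice norm2, defaulting to the last (time) dimension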

View File

@ -125,6 +125,12 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj>
typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
typename vobj::scalar_object s;
peekSite(s,l,site);
return s;
}
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
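A hedged example of the new by-value peekSite overload added above; the field U and its grid are assumed to exist.

Coordinate site({0,0,0,0});
auto s = peekSite(U,site);       // scalar object returned by value
// equivalent to the existing reference form: peekSite(s,U,site);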

View File

@ -142,6 +142,15 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
@ -159,6 +168,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum);
return ssum;
}
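A minimal sketch of when the new _large entry point might be used; "BigSiteField" is an illustrative name for any Lattice type whose per-site object is too large for the shared-memory GPU reduction.

BigSiteField f(grid);            // illustrative: site object exceeds the GPU shared-memory limit
auto total = sum_large(f);       // same result as sum(f), word-by-word reduction path on GPU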
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -207,6 +232,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites();
// Might make all code paths go this way.
#if 0
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
@ -216,15 +242,31 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
});
}
#else
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
}
#endif
// This is in single precision and fails some tests
auto anrm = sum(inner_tmp_v,sites);
auto anrm = sumD(inner_tmp_v,sites);
nrm = anrm;
return nrm;
}
@ -258,7 +300,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
conformable(x,y);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
// typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x.Grid();
@ -270,17 +312,29 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, 1,{
auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm);
return nrm;
}
@ -434,6 +488,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
}
template<class vobj> inline
std::vector<typename vobj::scalar_object>
sliceSum(const Lattice<vobj> &Data,int orthogdim)
{
std::vector<typename vobj::scalar_object> result;
sliceSum(Data,result,orthogdim);
return result;
}
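A hedged usage sketch of the new by-value sliceSum overload; the field f and its grid are assumed to exist.

LatticeComplexD f(grid);
int mu = grid->Nd()-1;                       // orthogonal (time) direction
auto slices = sliceSum(f,mu);                // returns std::vector<scalar_object>
for(int t=0;t<(int)slices.size();t++){
  std::cout << t << " " << slices[t] << std::endl;
}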
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)

View File

@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
}
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
#ifdef GRID_CUDA
@ -37,13 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
/*
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
*/
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
exit(EXIT_FAILURE);
return 0;
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
return 1;
}
template <class sobj, class Iterator>
@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
@ -207,7 +207,9 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
assert(ok);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
@ -218,6 +220,54 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
auto result = buffer_v[0];
return result;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
scalarD *ret_p = (scalarD *)&ret;
const int words = sizeof(vobj)/sizeof(vector);
Vector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
ret_p[w] = sumD_gpu_small(tbuf,osites);
}
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobj;
sobj ret;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
if ( ok ) {
ret = sumD_gpu_small(lat,osites);
} else {
ret = sumD_gpu_large(lat,osites);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -230,6 +280,13 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);

View File

@ -424,9 +424,32 @@ public:
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
#if 1
thread_for( lidx, _grid->lSites(), {
int gidx;
int o_idx;
int i_idx;
int rank;
Coordinate pcoor;
Coordinate lcoor;
Coordinate gcoor;
_grid->LocalIndexToLocalCoor(lidx,lcoor);
pcoor=_grid->ThisProcessorCoor();
_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
});
#else
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int o_idx;
@ -443,6 +466,7 @@ public:
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
#endif
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient

View File

@ -855,7 +855,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
template<class vobj>
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
{
typedef typename vobj::scalar_object sobj;

View File

@ -65,29 +65,37 @@ GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0);
GridLogError.Active(1);
GridLogWarning.Active(0);
GridLogMessage.Active(1); // at least the messages should be always on
GridLogMemory.Active(0); // memory logging is off unless explicitly requested
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
if (logstreams[i] == std::string("Memory")) GridLogMemory.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
}
}
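A hedged sketch of enabling the new log channels programmatically; in practice the same strings are typically supplied through the command-line log option and forwarded to GridLogConfigure.

std::vector<std::string> streams = {"Memory","Dslash","Performance","NoHMC"};
GridLogConfigure(streams);                       // Memory/Dslash/Performance on, HMC summary off
std::cout << GridLogMemory << "memory channel active" << std::endl;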

View File

@ -138,7 +138,8 @@ public:
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
stream << log.colour() << std::left;
// stream << log.colour() << std::left;
stream << std::left;
if (log.chanWidth > 0)
{
stream << std::setw(log.chanWidth);
@ -153,9 +154,9 @@ public:
stream << log.evidence()
<< now << log.background() << " : " ;
}
stream << log.colour();
// stream << log.colour();
stream << std::right;
stream.flags(f);
return stream;
} else {
return devnull;
@ -180,8 +181,11 @@ extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern Colours GridLogColours;
std::string demangle(const char* name) ;

View File

@ -31,6 +31,7 @@ directory
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <map>
#include <pwd.h>
@ -654,7 +655,8 @@ class IldgWriter : public ScidacWriter {
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
ildgfmt.field = std::string("su3gauge");
const std::string stNC = std::to_string( Nc ) ;
ildgfmt.field = std::string("su"+stNC+"gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
@ -871,7 +873,8 @@ class IldgReader : public GridLimeReader {
} else {
assert(found_ildgFormat);
assert ( ildgFormat_.field == std::string("su3gauge") );
const std::string stNC = std::to_string( Nc ) ;
assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@ -879,7 +882,7 @@ class IldgReader : public GridLimeReader {
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.data_type = std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC);
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);

View File

@ -6,8 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -182,8 +182,8 @@ class GaugeStatistics
public:
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
header.link_trace=WilsonLoops<Impl>::linkTrace(data);
header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
header.link_trace = WilsonLoops<Impl>::linkTrace(data);
header.plaquette = WilsonLoops<Impl>::avgPlaquette(data);
}
};
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
@ -203,20 +203,24 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
assert( Nc < 4 && Nc > 1 ) ;
for(int mu=0;mu<Nd;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
cm(mu)()(1,1) = adj(cm(mu)()(0,x)) ;
#else
const int x=0 , y=1 , z=2 ; // a little disingenuous labelling
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
#endif
}
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, Nc-1>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
@ -278,7 +282,6 @@ struct GaugeSimpleMunger{
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
@ -317,8 +320,8 @@ template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
@ -330,8 +333,8 @@ template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
for(int i=0;i<Nc-1;i++){
for(int j=0;j<Nc;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}

View File

@ -9,6 +9,7 @@
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Jamie Hudspith <renwick.james.hudspth@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -30,6 +31,8 @@
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
#include <string>
NAMESPACE_BEGIN(Grid);
using namespace Grid;
@ -39,9 +42,11 @@ using namespace Grid;
////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO {
public:
typedef Lattice<vLorentzColourMatrixD> GaugeField;
// Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out);
}
@ -145,15 +150,17 @@ public:
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
const int ieee32big = (format == std::string("IEEE32BIG"));
const int ieee32 = (format == std::string("IEEE32"));
const int ieee64big = (format == std::string("IEEE64BIG"));
const int ieee64 = (format == std::string("IEEE64") || \
format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
const std::string stNC = std::to_string( Nc ) ;
if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
@ -164,7 +171,7 @@ public:
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
} else if ( header.data_type == std::string("4D_SU"+stNC+"_GAUGE_"+stNC+"x"+stNC) ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
@ -198,7 +205,7 @@ public:
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum );
@ -209,27 +216,29 @@ public:
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"))
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
writeConfiguration(Umu,file,0,1,ens_label);
writeConfiguration(Umu,file,0,1,ens_label,ens_id,sequence_number);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
int two_row,
int bits32,
std::string ens_label = std::string("DWF"))
std::string ens_label = std::string("DWF"),
std::string ens_id = std::string("UKQCD"),
unsigned int sequence_number = 1)
{
typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = std::string("UKQCD");
header.sequence_number = sequence_number;
header.ensemble_id = ens_id;
header.ensemble_label = ens_label;
header.hdr_version = "1.0" ;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
@ -243,10 +252,14 @@ public:
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
// Sod it -- always write NcxNc double
header.floating_point = std::string("IEEE64BIG");
const std::string stNC = std::to_string( Nc ) ;
if( two_row ) {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE" );
} else {
header.data_type = std::string("4D_SU" + stNC + "_GAUGE_" + stNC + "x" + stNC );
}
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
@ -254,8 +267,15 @@ public:
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
if( two_row ) {
Gauge3x2unmunger<fobj2D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleUnmunger<fobj3D,sobj> munge;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
}
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
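A hedged sketch of the extended NERSC I/O interface; Umu is an existing double-precision gauge field, the file names are illustrative, and the read call uses the pre-existing readConfiguration interface, which is assumed rather than shown in this diff.

NerscIO::exitOnReadPlaquetteMismatch() = false;        // disable the assert on plaquette mismatch
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,"ckpoint_lat.100");
// write with the new ensemble id and sequence number arguments (default NcxNc format)
NerscIO::writeConfiguration(Umu,"ckpoint_lat.101","DWF","MyEnsemble",101);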
@ -287,8 +307,7 @@ public:
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
@ -328,7 +347,7 @@ public:
GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header);
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);

View File

@ -27,10 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid);
GridTimePoint theProgramStart = GridClock::now();
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {

View File

@ -72,17 +72,9 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/

View File

@ -35,17 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid)
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
#ifdef TIMERS_ON
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
//typedef std::chrono::system_clock GridClock;
typedef std::chrono::high_resolution_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::seconds GridSecs;
@ -53,6 +44,15 @@ typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridUsecs;
typedef std::chrono::microseconds GridTime;
extern GridTimePoint theProgramStart;
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
auto usecs = std::chrono::duration_cast<GridUsecs>(GridClock::now()-theProgramStart);
return 1.0*usecs.count();
}
inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time)
{
stream << time.count()<<" s";

Grid/perfmon/Tracing.h (new file, 66 lines)
View File

@ -0,0 +1,66 @@
#pragma once
#ifdef GRID_TRACING_NVTX
#include <nvToolsExt.h>
class GridTracer {
public:
GridTracer(const char* name) {
nvtxRangePushA(name);
}
~GridTracer() {
nvtxRangePop();
}
};
inline void tracePush(const char *name) { nvtxRangePushA(name); }
inline void tracePop(const char *name) { nvtxRangePop(); }
inline int traceStart(const char *name) { }
inline void traceStop(int ID) { }
#endif
#ifdef GRID_TRACING_ROCTX
#include <roctracer/roctx.h>
class GridTracer {
public:
GridTracer(const char* name) {
roctxRangePushA(name);
std::cout << "roctxRangePush "<<name<<std::endl;
}
~GridTracer() {
roctxRangePop();
std::cout << "roctxRangePop "<<std::endl;
}
};
inline void tracePush(const char *name) { roctxRangePushA(name); }
inline void tracePop(const char *name) { roctxRangePop(); }
inline int traceStart(const char *name) { roctxRangeStart(name); }
inline void traceStop(int ID) { roctxRangeStop(ID); }
#endif
#ifdef GRID_TRACING_TIMER
class GridTracer {
public:
const char *name;
double elapsed;
GridTracer(const char* _name) {
name = _name;
elapsed=-usecond();
}
~GridTracer() {
elapsed+=usecond();
std::cout << GridLogTracing << name << " took " <<elapsed<< " us" <<std::endl;
}
};
inline void tracePush(const char *name) { }
inline void tracePop(const char *name) { }
inline int traceStart(const char *name) { return 0; }
inline void traceStop(int ID) { }
#endif
#ifdef GRID_TRACING_NONE
#define GRID_TRACE(name)
inline void tracePush(const char *name) { }
inline void tracePop(const char *name) { }
inline int traceStart(const char *name) { return 0; }
inline void traceStop(int ID) { }
#else
#define GRID_TRACE(name) GridTracer uniq_name_using_macros##__COUNTER__(name);
#endif
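A minimal sketch of scoped tracing with the header above; the function and region names are illustrative. With GRID_TRACING_NVTX or GRID_TRACING_ROCTX the ranges show up in the vendor profiler, with GRID_TRACING_TIMER they are printed to the GridLogTracing channel, and with GRID_TRACING_NONE GRID_TRACE compiles away.

void HaloAndCompute(void)
{
  GRID_TRACE("HaloAndCompute");     // RAII range: pushed here, popped automatically at scope exit
  tracePush("halo-exchange");       // manual push/pop for a sub-region
  // ... communication ...
  tracePop("halo-exchange");
  // ... interior compute ...
}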

View File

@ -16,8 +16,12 @@
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#endif
#endif
#include "pugixml.h"

View File

@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
#define ColourIndex (2)
#define SpinIndex (1)
#define LorentzIndex (0)
#define GparityFlavourIndex (0)
// Also should make these a named enum type
static constexpr int DaggerNo=0;
@ -87,6 +88,8 @@ template<typename T> struct isCoarsened {
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
// ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where Wilson operator
@ -110,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVec
template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
template<typename vtype> using iGparityFlavourVector = iVector<iScalar<iScalar<vtype> >, Ngp>;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
// Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix;
@ -176,6 +181,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
//G-parity flavour matrix
typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
// Spin vector
typedef iSpinVector<Complex > SpinVector;
typedef iSpinVector<ComplexF> SpinVectorF;
@ -220,6 +235,16 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
//G-parity flavour vector
typedef iGparityFlavourVector<Complex > GparityFlavourVector;
typedef iGparityFlavourVector<ComplexF> GparityFlavourVectorF;
typedef iGparityFlavourVector<ComplexD> GparityFlavourVectorD;
typedef iGparityFlavourVector<vComplex > vGparityFlavourVector;
typedef iGparityFlavourVector<vComplexF> vGparityFlavourVectorF;
typedef iGparityFlavourVector<vComplexD> vGparityFlavourVectorD;
// singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.

View File

@ -40,9 +40,43 @@ class Action
public:
bool is_smeared = false;
RealD deriv_norm_sum;
RealD deriv_max_sum;
RealD Fdt_norm_sum;
RealD Fdt_max_sum;
int deriv_num;
RealD deriv_us;
RealD S_us;
RealD refresh_us;
void reset_timer(void) {
deriv_us = S_us = refresh_us = 0.0;
deriv_num=0;
deriv_norm_sum = deriv_max_sum=0.0;
Fdt_max_sum = Fdt_norm_sum = 0.0;
}
void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
deriv_max_sum+=max;
deriv_norm_sum+=nrm;
Fdt_max_sum+=Fdt_max;
Fdt_norm_sum+=Fdt_nrm; deriv_num++;
}
RealD deriv_max_average(void) { return deriv_max_sum/deriv_num; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
RealD Fdt_max_average(void) { return Fdt_max_sum/deriv_num; };
RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; };
RealD deriv_timer(void) { return deriv_us; };
RealD S_timer(void) { return S_us; };
RealD refresh_timer(void) { return refresh_us; };
void deriv_timer_start(void) { deriv_us-=usecond(); }
void deriv_timer_stop(void) { deriv_us+=usecond(); }
void refresh_timer_start(void) { refresh_us-=usecond(); }
void refresh_timer_stop(void) { refresh_us+=usecond(); }
void S_timer_start(void) { S_us-=usecond(); }
void S_timer_stop(void) { S_us+=usecond(); }
// Heatbath?
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
virtual std::string action_name() = 0; // return the action name
virtual std::string LogParameters() = 0; // prints action parameters
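A schematic of how an integrator level might drive the new timing and force-logging hooks; the function name, force norms and dt below are illustrative, not the actual Integrator code:
template<class GaugeField>
void update_P_sketch(Action<GaugeField>* as, const GaugeField& U, GaugeField& force, RealD dt)
{
  as->deriv_timer_start();
  as->deriv(U, force);                        // dS/dU for this action
  as->deriv_timer_stop();
  RealD nrm = norm2(force);                   // force norm (illustrative bookkeeping)
  RealD max = nrm;                            // stand-in for a max-norm estimate
  as->deriv_log(nrm, max, nrm*dt, max*dt);    // accumulate force / Fdt statistics
  // At the end of a trajectory:
  //   std::cout << as->action_name() << " <|F|> = " << as->deriv_norm_average() << std::endl;
  //   as->reset_timer();
}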

View File

@ -37,6 +37,10 @@ NAMESPACE_CHECK(ActionSet);
#include <Grid/qcd/action/ActionParams.h>
NAMESPACE_CHECK(ActionParams);
#include <Grid/qcd/action/filters/MomentumFilter.h>
#include <Grid/qcd/action/filters/DirichletFilter.h>
#include <Grid/qcd/action/filters/DDHMCFilter.h>
////////////////////////////////////////////
// Gauge Actions
////////////////////////////////////////////

View File

@ -37,24 +37,33 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams {
Coordinate twists;
GparityWilsonImplParams() : twists(Nd, 0) {};
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
Coordinate dirichlet; // Blocksize of dirichlet BCs
GparityWilsonImplParams() : twists(Nd, 0) { dirichlet.resize(0); };
};
struct WilsonImplParams {
bool overlapCommsCompute;
Coordinate dirichlet; // Blocksize of dirichlet BCs
AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() {
dirichlet.resize(0);
boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0);
};
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0);
dirichlet.resize(0);
}
};
struct StaggeredImplParams {
StaggeredImplParams() {};
Coordinate dirichlet; // Blocksize of dirichlet BCs
StaggeredImplParams()
{
dirichlet.resize(0);
};
};
struct OneFlavourRationalParams : Serializable {
@ -63,9 +72,11 @@ struct StaggeredImplParams {
RealD, hi,
int, MaxIter,
RealD, tolerance,
RealD, mdtolerance,
int, degree,
int, precision,
int, BoundsCheckFreq);
int, BoundsCheckFreq,
RealD, BoundsCheckTol);
// MaxIter and tolerance, vectors??
@ -76,16 +87,62 @@ struct StaggeredImplParams {
RealD tol = 1.0e-8,
int _degree = 10,
int _precision = 64,
int _BoundsCheckFreq=20)
int _BoundsCheckFreq=20,
RealD mdtol = 1.0e-6,
double _BoundsCheckTol=1e-6)
: lo(_lo),
hi(_hi),
MaxIter(_maxit),
tolerance(tol),
mdtolerance(mdtol),
degree(_degree),
precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq){};
BoundsCheckFreq(_BoundsCheckFreq),
BoundsCheckTol(_BoundsCheckTol){};
};
/*Action parameters for the generalized rational action
The approximation is for (M^dag M)^{1/inv_pow}
where inv_pow is the denominator of the fractional power.
Default inv_pow=2 for square root, making this equivalent to
the OneFlavourRational action
*/
struct RationalActionParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams,
int, inv_pow,
RealD, lo, //low eigenvalue bound of rational approx
RealD, hi, //high eigenvalue bound of rational approx
int, MaxIter, //maximum iterations in msCG
RealD, action_tolerance, //msCG tolerance in action evaluation
int, action_degree, //rational approx degree in action evaluation
RealD, md_tolerance, //msCG tolerance in MD integration
int, md_degree, //rational approx degree in MD integration
int, precision, //precision of floating point arithmetic
int, BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
// constructor
RationalActionParams(int _inv_pow = 2,
RealD _lo = 0.0,
RealD _hi = 1.0,
int _maxit = 1000,
RealD _action_tolerance = 1.0e-8,
int _action_degree = 10,
RealD _md_tolerance = 1.0e-8,
int _md_degree = 10,
int _precision = 64,
int _BoundsCheckFreq=20)
: inv_pow(_inv_pow),
lo(_lo),
hi(_hi),
MaxIter(_maxit),
action_tolerance(_action_tolerance),
action_degree(_action_degree),
md_tolerance(_md_tolerance),
md_degree(_md_degree),
precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq){};
};
NAMESPACE_END(Grid);
#endif
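A sketch of how the extended parameter structs might be filled in application code; all numerical values below are illustrative:
// Positional constructor keeps the old ordering; the new mdtolerance and
// BoundsCheckTol sit at the end of the argument list.
OneFlavourRationalParams ofp(0.1, 64.0, 1000, 1.0e-9, 14, 64, 20, 1.0e-6, 1.0e-6);
//                           lo   hi   iter  tol    deg prec frq  mdtol   bounds tol

RationalActionParams grp;          // defaults give inv_pow=2, i.e. the square root
grp.inv_pow          = 4;          // e.g. a quartic root
grp.action_tolerance = 1.0e-10;    // tight msCG tolerance for accept/reject
grp.md_tolerance     = 1.0e-7;     // looser msCG tolerance during MD

WilsonImplParams wp;
wp.dirichlet.resize(Nd, 0);
wp.dirichlet[Nd-1] = 16;           // illustrative Dirichlet block size in the time direction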

View File

@ -68,9 +68,17 @@ public:
///////////////////////////////////////////////////////////////
// Support for MADWF tricks
///////////////////////////////////////////////////////////////
RealD Mass(void) { return mass; };
RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
RealD MassPlus(void) { return mass_plus; };
RealD MassMinus(void) { return mass_minus; };
void SetMass(RealD _mass) {
mass=_mass;
mass_plus=mass_minus=_mass;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void SetMass(RealD _mass_plus, RealD _mass_minus) {
mass_plus=_mass_plus;
mass_minus=_mass_minus;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
} ;
void P(const FermionField &psi, FermionField &chi);
@ -108,7 +116,7 @@ public:
void MeooeDag5D (const FermionField &in, FermionField &out);
// protected:
RealD mass;
RealD mass_plus, mass_minus;
// Save arguments to SetCoefficientsInternal
Vector<Coeff_t> _gamma;
@ -175,16 +183,6 @@ public:
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
void CayleyReport(void);
void CayleyZeroCounters(void);
double M5Dflops;
double M5Dcalls;
double M5Dtime;
double MooeeInvFlops;
double MooeeInvCalls;
double MooeeInvTime;
protected:
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
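A small sketch of the new mass interface; Ddwf stands for any already-constructed CayleyFermion5D-derived operator, and the values are illustrative:
Ddwf.SetMass(0.01);          // degenerate case: mass_plus = mass_minus = 0.01
Ddwf.SetMass(0.01, 0.02);    // new overload: independent masses on the two walls
RealD mbar = Ddwf.Mass();    // now reports the average, (0.01 + 0.02)/2 = 0.015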

View File

@ -0,0 +1,435 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
////////////////////////////////////////////
// Standard Clover
// (4+m0) + csw * clover_term
// Exp Clover
// (4+m0) * exp(csw/(4+m0) clover_term)
// = (4+m0) + csw * clover_term + ...
////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
//////////////////////////////////
// Generic Standard Clover
//////////////////////////////////
template<class Impl>
class CloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
static void Instantiate(CloverField& CloverTerm, CloverField& CloverTermInv, RealD csw_t, RealD diag_mass) {
GridBase *grid = CloverTerm.Grid();
CloverTerm += diag_mass;
int lvol = grid->lSites();
int DimRep = Impl::Dimension;
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
EigenInvCloverOp = EigenCloverOp.inverse();
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Generic Exp Clover
//////////////////////////////////
template<class Impl>
class ExpCloverHelpers: public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef WilsonCloverHelpers<Impl> Helpers;
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void Instantiate(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
Clover *= (1.0/diag_mass);
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
// qN = cN
// qn = cn + qn+1 X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
// prepare inverse
CloverInv = (-1.0)*Clover;
Clover = ExpClover * diag_mass;
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
CloverInv = ExpClover * (1.0/diag_mass);
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
//////////////////////////////////
// Compact Standard Clover
//////////////////////////////////
template<class Impl>
class CompactCloverHelpers: public CompactWilsonCloverHelpers<Impl>,
public WilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
Clover += diag_mass;
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
CloverTriangleField& Triangle,
RealD csw_t, RealD diag_mass) {
// Do nothing
}
// TODO: implement Cmunu for better performance with the compact layout, not here
// but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
return Helpers::Cmunu(U, lambda, mu, nu);
}
};
//////////////////////////////////
// Compact Exp Clover
//////////////////////////////////
template<class Impl>
class CompactExpCloverHelpers: public CompactWilsonCloverHelpers<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
// do nothing!
// the mass term is multiplied into exp(Clover) below
}
static int getNMAX(RealD prec, RealD R) {
/* compute stop condition for exponential */
int NMAX=1;
RealD cond=R*R/2.;
while (cond*std::exp(R)>prec) {
NMAX++;
cond*=R/(double)(NMAX+1);
}
return NMAX;
}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
typedef iMatrix<ComplexD,6> mat;
RealD qn[6];
RealD qnold[6];
RealD p[5];
RealD trA2, trA3, trA4;
mat A2, A3, A4, A5;
A2 = alpha * alpha * arg * arg;
A3 = alpha * arg * A2;
A4 = A2 * A2;
A5 = A2 * A3;
trA2 = toReal( trace(A2) );
trA3 = toReal( trace(A3) );
trA4 = toReal( trace(A4));
p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
p[3] = trA3 / 3.0;
p[4] = 0.5 * trA2;
qnold[0] = cN[Niter];
qnold[1] = 0.0;
qnold[2] = 0.0;
qnold[3] = 0.0;
qnold[4] = 0.0;
qnold[5] = 0.0;
for(int i = Niter-1; i >= 0; i--)
{
qn[0] = p[0] * qnold[5] + cN[i];
qn[1] = p[1] * qnold[5] + qnold[0];
qn[2] = p[2] * qnold[5] + qnold[1];
qn[3] = p[3] * qnold[5] + qnold[2];
qn[4] = p[4] * qnold[5] + qnold[3];
qn[5] = qnold[4];
qnold[0] = qn[0];
qnold[1] = qn[1];
qnold[2] = qn[2];
qnold[3] = qn[3];
qnold[4] = qn[4];
qnold[5] = qn[5];
}
mat unit(1.0);
dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
GridBase* grid = Diagonal.Grid();
int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
//
// Implementation completely in Daniel's layout
//
// Taylor expansion with Cayley-Hamilton recursion
// underlying Horner scheme as above
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++){
cn[i] = cn[i-1] / RealD(i);
}
// Taken over from Daniel's implementation
conformable(Diagonal, Triangle);
long lsites = grid->lSites();
{
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
typedef iMatrix<ComplexD,6> mat;
autoView(diagonal_v, Diagonal, CpuRead);
autoView(triangle_v, Triangle, CpuRead);
autoView(diagonalExp_v, Diagonal, CpuWrite);
autoView(triangleExp_v, Triangle, CpuWrite);
thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
mat srcCloverOpUL(0.0); // upper left block
mat srcCloverOpLR(0.0); // lower right block
mat ExpCloverOp;
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_exp_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_exp_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
int block;
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
// exp(Clover)
ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
});
}
Diagonal *= diag_mass;
Triangle *= diag_mass;
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
return lambda;
}
};
NAMESPACE_END(Grid);
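For reference, the exponentiated clover above is a plain Taylor series of exp evaluated with the Horner recursion q_n = c_n + x*q_{n+1}, and getNMAX picks the truncation order N from the remainder bound R^(N+1)/(N+1)! * e^R < prec. A minimal scalar model of the same scheme (standalone C++, not Grid code):
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  double R = 1.5, prec = 1e-12;
  int NMAX = 1;
  double cond = R * R / 2.0;                                  // R^(NMAX+1)/(NMAX+1)!
  while (cond * std::exp(R) > prec) { NMAX++; cond *= R / (double)(NMAX + 1); }

  std::vector<double> cn(NMAX + 1);                           // Taylor coefficients 1/i!
  cn[0] = 1.0;
  for (int i = 1; i <= NMAX; i++) cn[i] = cn[i - 1] / (double)i;

  double x = R, q = cn[NMAX];                                 // Horner: qN = cN, qn = cn + x*q(n+1)
  for (int i = NMAX - 1; i >= 0; i--) q = q * x + cn[i];
  std::printf("NMAX=%d  exp(%g) ~ %.15f  std::exp: %.15f\n", NMAX, x, q, std::exp(x));
  return 0;
}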

View File

@ -31,6 +31,7 @@
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
@ -85,7 +86,7 @@ NAMESPACE_BEGIN(Grid);
// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site
// = 84 complex words per site
template<class Impl>
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {

View File

@ -138,38 +138,52 @@ typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
// Clover fermions
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Compact Clover fermions
typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
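As a quick sanity check of the new alias templates (standalone assertion; assumes the fermion headers above are pulled in via Grid.h):
#include <Grid/Grid.h>
#include <type_traits>
using namespace Grid;

static_assert(std::is_same<WilsonExpCloverFermionD,
                           WilsonCloverFermion<WilsonImplD, ExpCloverHelpers<WilsonImplD> > >::value,
              "WilsonExpCloverFermionD selects the exponential clover policy");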
// Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;

View File

@ -49,6 +49,8 @@ public:
virtual FermionField &tmp(void) = 0;
virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know
GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

View File

@ -30,6 +30,18 @@ directory
NAMESPACE_BEGIN(Grid);
/*
Policy implementation for G-parity boundary conditions
Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
field with complex-conjugate boundary conditions. To ensure that the second flavor interacts with the conjugate links and the first
with the regular links, we overload DoubleStore, whose purpose is to store the gauge field together with the barrel-shifted gauge field
so that no links need to be communicated when applying the Dirac operator. The double-stored field therefore also carries a flavor
index that maps to either the link or the conjugate link; this flavored field is then used by multLink to apply the correct link to a spinor.
Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs.
*/
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
public:
@ -113,7 +125,7 @@ public:
|| ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0));
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
//Apply the links
int f_upper = permute_lane ? 1 : 0;
@ -139,10 +151,10 @@ public:
assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code
assert((sl == 1) || (sl == 2));
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
//If this site is a global boundary site, perform the G-parity flavor twist
if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
if ( sl == 2 ) {
//Only do the twist for lanes on the edge of the physical node
ExtractBuffer<sobj> vals(Nsimd);
extract(chi,vals);
@ -197,6 +209,19 @@ public:
reg = memory;
}
//Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
autoView(poke_f0_v, poke_f0, CpuRead);
autoView(poke_f1_v, poke_f1, CpuRead);
autoView(Uds_v, Uds, CpuWrite);
thread_foreach(ss,poke_f0_v,{
Uds_v[ss](0)(mu) = poke_f0_v[ss]();
Uds_v[ss](1)(mu) = poke_f1_v[ss]();
});
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
conformable(Uds.Grid(),GaugeGrid);
@ -207,14 +232,19 @@ public:
GaugeLinkField Uconj(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
//Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
for(int mu=0;mu<Nd-1;mu++){
if( Params.twists[mu] ){
LatticeCoordinate(coor,mu);
}
U = PeekIndex<LorentzIndex>(Umu,mu);
Uconj = conjugate(U);
// Implement the isospin rotation sign on the boundary between f=1 and f=0
// This phase could come from a simple bc 1,1,-1,1 ..
int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
if ( Params.twists[mu] ) {
@ -229,7 +259,7 @@ public:
thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss]();
});
});
}
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
@ -260,6 +290,38 @@ public:
});
}
}
{ //periodic / antiperiodic temporal BCs
int mu = Nd-1;
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
GaugeLinkField *Upoke = &U;
if(Params.twists[mu]){ //antiperiodic
Utmp = where(coor == Lmu, -U, U);
Upoke = &Utmp;
}
Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
//Get the barrel-shifted field
Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
Upoke = &Utmp;
if(Params.twists[mu]){
U = where(coor == 0, -Utmp, Utmp); //boundary phase
Upoke = &U;
}
Uconj = conjugate(*Upoke);
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@ -298,28 +360,48 @@ public:
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0);
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
int Ls=Btilde.Grid()->_fdimensions[0];
{
autoView( tmp_v , tmp, CpuWrite);
autoView( Atilde_v , Atilde, CpuRead);
autoView( Btilde_v , Btilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
GridBase *GaugeGrid = mat.Grid();
Lattice<iScalar<vInteger> > coor(GaugeGrid);
if( Params.twists[mu] ){
LatticeCoordinate(coor,mu);
}
autoView( mat_v , mat, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
//Flavor 0
auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
sum = sum + outerProduct(bb,aa);
//Flavor 1
bb = coalescedRead(Btilde_v[sF](1)(spn) );
aa = coalescedRead(Atilde_v[sF](1)(spn) );
sum = sum + conjugate(outerProduct(bb,aa));
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}
};
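For orientation, a read-back sketch of the flavor-doubled layout produced by DoubleStore and pokeGparityDoubledGaugeField above; ss and mu are illustrative indices, and the mu+4 slot follows the backward-hop convention used in the code:
autoView(Uds_v, Uds, CpuRead);
auto U_f0    = Uds_v[ss](0)(mu);       // ordinary link, multiplies flavor 0
auto U_f1    = Uds_v[ss](1)(mu);       // conjugate link, multiplies flavor 1
auto Uback_0 = Uds_v[ss](0)(mu + 4);   // adjoint, barrel-shifted link for the backward hop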

View File

@ -47,18 +47,6 @@ public:
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////

View File

@ -52,18 +52,6 @@ public:
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////

View File

@ -47,18 +47,6 @@ public:
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////

View File

@ -32,6 +32,7 @@
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
@ -51,7 +52,7 @@ NAMESPACE_BEGIN(Grid);
// csw_r = csw_t to recover the isotropic version
//////////////////////////////////////////////////////////////////
template <class Impl>
template<class Impl, class CloverHelpers>
class WilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>
{

View File

@ -209,6 +209,8 @@ public:
};
////////////////////////////////////////////////////////
template<class Impl> class CompactWilsonCloverHelpers {
public:

View File

@ -47,8 +47,6 @@ class CompactWilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
static constexpr int Nred = Nc * Nhs; // 6
static constexpr int Nblock = Nhs; // 2
static constexpr int Ndiagonal = Nred; // 6

View File

@ -294,11 +294,7 @@ public:
typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector;
void ZeroCountersi(void) { }
void Reporti(int calls) { }
std::vector<int> surface_list;
// Vector<int> surface_list;
WilsonStencil(GridBase *grid,
int npoints,
int checkerboard,
@ -306,11 +302,11 @@ public:
const std::vector<int> &distances,Parameters p)
: CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p)
{
ZeroCountersi();
surface_list.resize(0);
// surface_list.resize(0);
this->same_node.resize(npoints);
};
/*
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
@ -331,7 +327,8 @@ public:
}
}
}
*/
template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
{
@ -398,7 +395,6 @@ public:
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
}
};

View File

@ -74,20 +74,6 @@ public:
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
void Report(void);
void ZeroCounters(void);
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopTotalTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent

View File

@ -75,19 +75,8 @@ public:
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
void Report(void);
void ZeroCounters(void);
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopTotalTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
int Dirichlet;
Coordinate Block;
///////////////////////////////////////////////////////////////
// Implement the abstract base
@ -173,7 +162,10 @@ public:
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams());
virtual void DirichletBlock(const Coordinate & block)
{
}
// Constructors
/*
WilsonFermion5D(int simd,

View File

@ -47,7 +47,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
FiveDimRedBlackGrid,
FourDimGrid,
FourDimRedBlackGrid,_M5,p),
mass(_mass)
mass_plus(_mass), mass_minus(_mass)
{
}
@ -152,65 +152,13 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
}
}
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
{
this->Report();
Coordinate latt = GridDefaultLatt();
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = this->_FourDimGrid->_Nprocessors;
if ( M5Dcalls > 0 ) {
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
// Flops = 10.0*(Nc*Ns) *Ls*vol
RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
// Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
// read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
// write = 1
RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
std::cout << GridLogMessage << "Average bandwidth (GB/s) : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
}
if ( MooeeInvCalls > 0 ) {
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
#else
// Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
#endif
}
}
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
{
this->ZeroCounters();
M5Dflops=0;
M5Dcalls=0;
M5Dtime=0;
MooeeInvFlops=0;
MooeeInvCalls=0;
MooeeInvTime=0;
}
template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
Vector<Coeff_t> diag (Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass;
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
M5D(psi,chi,chi,lower,diag,upper);
}
template<class Impl>
@ -220,8 +168,8 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
Vector<Coeff_t> diag = bs;
Vector<Coeff_t> upper= cs;
Vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,Din,lower,diag,upper);
}
// FIXME Redunant with the above routine; check this and eliminate
@ -235,8 +183,8 @@ template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &
upper[i]=-ceo[i];
lower[i]=-ceo[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@ -250,8 +198,8 @@ void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &
upper[i]=-cee[i];
lower[i]=-cee[i];
}
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
@ -266,9 +214,9 @@ void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &
// Assemble the 5d matrix
if ( s==0 ) {
upper[s] = -cee[s+1] ;
lower[s] = mass*cee[Ls-1];
lower[s] = mass_minus*cee[Ls-1];
} else if ( s==(Ls-1)) {
upper[s] = mass*cee[0];
upper[s] = mass_plus*cee[0];
lower[s] = -cee[s-1];
} else {
upper[s]=-cee[s+1];
@ -291,8 +239,8 @@ void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0);
Vector<Coeff_t> lower(Ls,-1.0);
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
upper[Ls-1]=-mass_plus*upper[Ls-1];
lower[0] =-mass_minus*lower[0];
M5Ddag(psi,chi,chi,lower,diag,upper);
}
@ -307,9 +255,9 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
for (int s=0;s<Ls;s++){
if ( s== 0 ) {
upper[s] = cs[s+1];
lower[s] =-mass*cs[Ls-1];
lower[s] =-mass_minus*cs[Ls-1];
} else if ( s==(Ls-1) ) {
upper[s] =-mass*cs[0];
upper[s] =-mass_plus*cs[0];
lower[s] = cs[s-1];
} else {
upper[s] = cs[s+1];
@ -552,7 +500,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass*cee[Ls-1]/bee[0];
leem[i]=mass_minus*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) {
assert(bee[j+1]!=Coeff_t(0.0));
leem[i]*= aee[j]/bee[j+1];
@ -560,7 +508,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
ueem[i]=mass;
ueem[i]=mass_plus;
for(int j=1;j<=i;j++) ueem[i]*= cee[j]/bee[j];
ueem[i]*= aee[0]/bee[0];
@ -573,7 +521,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
}
{
Coeff_t delta_d=mass*cee[Ls-1];
Coeff_t delta_d=mass_minus*cee[Ls-1];
for(int j=0;j<Ls-1;j++) {
assert(bee[j] != Coeff_t(0.0));
delta_d *= cee[j]/bee[j];
@ -642,7 +590,10 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
#if (!defined(GRID_HIP))
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
@ -761,7 +712,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
else q_out += C;
}
#endif
}
template <class Impl>
@ -777,6 +728,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
assert(mu>=0);
assert(mu<Nd);
assert(mass_plus == mass_minus);
RealD mass = mass_plus;
#if 0
int tshift = (mu == Nd-1) ? 1 : 0;
@ -826,7 +779,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
}
#endif
#if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
////////////////////////////////////////////////
@ -946,7 +898,6 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
InsertSlice(L_Q, q_out, s , 0);
}
#endif
}
#undef Pp
#undef Pm

View File

@ -63,23 +63,18 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
// 10 = 3 complex mult + 2 complex add
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
M5Dcalls++;
M5Dtime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
uint64_t nloop = grid->oSites();
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss= sss*Ls;
uint64_t s = sss%Ls;
uint64_t ss= sss-s;
typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1, tmp2;
for(int s=0;s<Ls;s++){
uint64_t idx_u = ss+((s+1)%Ls);
uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp1,psi(idx_u));
spProj5p(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
}
uint64_t idx_u = ss+((s+1)%Ls);
uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp1,psi(idx_u));
spProj5p(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
});
M5Dtime+=usecond();
}
template<class Impl>
@ -105,23 +100,18 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
int Ls=this->Ls;
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
uint64_t nloop = grid->oSites();
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
uint64_t s = sss%Ls;
uint64_t ss= sss-s;
typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1,tmp2;
for(int s=0;s<Ls;s++){
uint64_t idx_u = ss+((s+1)%Ls);
uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp1,psi(idx_u));
spProj5m(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
}
uint64_t idx_u = ss+((s+1)%Ls);
uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp1,psi(idx_u));
spProj5m(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
});
M5Dtime+=usecond();
}
template<class Impl>
@ -142,8 +132,6 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
auto pleem = & leem[0];
auto pueem = & ueem[0];
MooeeInvCalls++;
MooeeInvTime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
@ -180,8 +168,6 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
coalescedWrite(chi[ss+s],res);
}
});
MooeeInvTime+=usecond();
}
@ -204,10 +190,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
assert(psi.Checkerboard() == psi.Checkerboard());
MooeeInvCalls++;
MooeeInvTime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
@ -244,7 +226,6 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
coalescedWrite(chi[ss+s],res);
}
});
MooeeInvTime+=usecond();
}
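The kernels above were flattened so that one accelerator thread handles a single (4d site, s) pair instead of looping serially over Ls; a standalone check of the index bookkeeping (plain C++, not Grid code):
#include <cassert>

int main() {
  const int Ls = 8, oSites = 4 * Ls;        // oSites of the 5d grid is a multiple of Ls
  for (int sss = 0; sss < oSites; ++sss) {
    int s  = sss % Ls;                      // coordinate in the fifth dimension
    int ss = sss - s;                       // start of this 4d site's Ls-block
    assert(ss % Ls == 0 && ss + s == sss);  // chi[ss+s] addresses the same element as before
  }
  return 0;
}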

View File

@ -94,10 +94,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
d_p[ss] = diag[s];
}}
M5Dcalls++;
M5Dtime-=usecond();
assert(Nc==3);
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
@ -198,7 +194,6 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
}
#endif
});
M5Dtime+=usecond();
}
template<class Impl>
@ -242,8 +237,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
d_p[ss] = diag[s];
}}
M5Dcalls++;
M5Dtime-=usecond();
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
@ -339,7 +332,6 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
}
#endif
});
M5Dtime+=usecond();
}
@ -813,9 +805,6 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
}
assert(_Matp->size()==Ls*LLs);
MooeeInvCalls++;
MooeeInvTime-=usecond();
if ( switcheroo<Coeff_t>::iscomplex() ) {
thread_loop( (auto site=0;site<vol;site++),{
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
@ -825,7 +814,7 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
});
}
MooeeInvTime+=usecond();
}
NAMESPACE_END(Grid);

View File

@ -32,17 +32,18 @@
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
NAMESPACE_BEGIN(Grid);
template<class Impl>
CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, csw_r(_csw_r)
, csw_t(_csw_t)
@ -58,50 +59,55 @@ CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
, BoundaryMask(&Fgrid)
, BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
if (clover_anisotropy.isAnisotropic)
csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (open_boundaries)
if (open_boundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->open_boundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
@ -109,8 +115,8 @@ void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& o
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
@ -118,20 +124,20 @@ void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
@ -144,13 +150,13 @@ void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionFiel
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
@ -163,23 +169,23 @@ void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionF
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!open_boundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
@ -251,7 +257,7 @@ void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionFi
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
@ -261,18 +267,18 @@ void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionFi
force += clover_force;
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField& in,
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
@ -285,8 +291,8 @@ void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField&
CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
@ -318,22 +324,27 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
TmpOriginal += this->diag_mass;
// Handle mass term based on clover policy
CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
// Convert the data layout of the clover term
double t4 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Possibly modify the boundary values
// Exponentiate the clover (nothing happens in case of the standard clover)
double t5 = usecond();
CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
// Possibly modify the boundary values
double t6 = usecond();
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the clover term in the improved layout
double t6 = usecond();
// Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
double t7 = usecond();
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
// Fill the remaining clover fields
double t7 = usecond();
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
@ -344,20 +355,19 @@ void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t8 = usecond();
#if 0
std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
<< " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
<< ", allocations = " << (t2 - t1) / 1e6
<< ", field strength = " << (t3 - t2) / 1e6
<< ", fill clover = " << (t4 - t3) / 1e6
<< ", convert = " << (t5 - t4) / 1e6
<< ", boundaries = " << (t6 - t5) / 1e6
<< ", inversions = " << (t7 - t6) / 1e6
<< ", pick cbs = " << (t8 - t7) / 1e6
<< ", total = " << (t8 - t0) / 1e6
<< std::endl;
#endif
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);
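Editor's note on the hunk above: CloverHelpers::MassTerm and CloverHelpers::Exponentiate_Clover move the mass handling and the (optional) exponentiation behind a policy class. Schematically, writing A for the field-strength contribution accumulated into TmpOriginal and m for this->diag_mass, the two policies should amount to the following (a sketch of the intended conventions, not taken from this diff):

```latex
C_{\text{standard}} \;=\; m + A ,
\qquad
C_{\text{exponential}} \;=\; m \, \exp\!\left( A / m \right) .
```

This is consistent with the in-code comment that Exponentiate_Clover does nothing for the standard clover, and with MassTerm replacing the former unconditional TmpOriginal += this->diag_mass.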

View File

@ -54,8 +54,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
auto pupper = &upper[0];
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
auto nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -71,7 +69,6 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
}
});
this->M5Dtime += usecond();
}
template<class Impl>
@ -91,8 +88,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
auto nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -108,7 +103,6 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
}
});
this->M5Dtime += usecond();
}
template<class Impl>
@ -127,8 +121,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
auto pleem = & this->leem[0];
auto pueem = & this->ueem[0];
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
uint64_t nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
@ -164,7 +156,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
coalescedWrite(chi[ss+s],res);
}
});
this->MooeeInvTime += usecond();
}
template<class Impl>
@ -185,8 +176,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
assert(psi.Checkerboard() == psi.Checkerboard());
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
auto nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
@ -223,7 +212,6 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
}
});
this->MooeeInvTime += usecond();
}
NAMESPACE_END(Grid);

View File

@ -298,45 +298,33 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
int LLs = in.Grid()->_rdimensions[0];
int len = U.Grid()->oSites();
DhopFaceTime-=usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Remove explicit thread mapping introduced for OPA reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime-=usecond();
{
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime+=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
DhopComputeTime2-=usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime2+=usecond();
}
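Editor's note: the hunk above keeps the two-pass structure while dropping the timers — the interior=1/exterior=0 call runs on sites that need no halo data while the communication posted by CommunicateBegin is in flight, and the interior=0/exterior=1 call finishes the surface after CommsMerge. Below is a minimal standalone sketch of that overlap pattern (illustrative only, not Grid code; the comms are simulated with std::async rather than MPI):

```cpp
// Standalone sketch of the interior/exterior split with overlapped comms.
// All names are illustrative placeholders.
#include <chrono>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

static std::vector<double> simulated_halo_exchange() {
  std::this_thread::sleep_for(std::chrono::milliseconds(5)); // stand-in for MPI latency
  return std::vector<double>(64, 1.0);                       // stand-in for received face data
}

static void apply_stencil(bool interior, bool exterior, const std::vector<double>* halo) {
  // A real kernel would loop over lattice sites; here we only show which pass runs.
  std::cout << "pass: interior=" << interior << " exterior=" << exterior
            << " halo_ready=" << (halo != nullptr) << "\n";
}

int main() {
  // Post the halo exchange asynchronously (plays the role of CommunicateBegin).
  auto halo = std::async(std::launch::async, simulated_halo_exchange);

  apply_stencil(true, false, nullptr);    // interior pass, overlapped with comms

  std::vector<double> faces = halo.get(); // plays the role of CommunicateComplete + CommsMerge
  apply_stencil(false, true, &faces);     // exterior pass once the halo is available
  return 0;
}
```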
template<class Impl>
@ -347,22 +335,14 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor;
int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond();
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
}
/*CHANGE END*/
@ -371,7 +351,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=1;
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
conformable(in.Grid(),out.Grid()); // drops the cb check
@ -383,7 +362,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=1;
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
conformable(in.Grid(),out.Grid()); // drops the cb check
@ -395,7 +373,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=2;
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
@ -404,58 +381,6 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Report(void)
{
Coordinate latt = GridDefaultLatt();
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _FourDimGrid->_Nprocessors;
RealD NN = _FourDimGrid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_FourDimGrid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
}
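Editor's note: the removed Report() above estimated sustained performance from the accumulated counters. As written in that code (compute time accumulated in microseconds, 1154 taken as the flop count per site of the improved staggered operator, and the factor of 2 for red-black counting), the estimate is

```latex
\text{MFlop/s} \;=\; \frac{1154 \, V \, N_{\text{calls}}}{2 \, T_{\text{compute}}[\mu\text{s}]},
\qquad
V = L_s \prod_{\mu=0}^{3} L_\mu ,
```

with the per-rank and per-node figures obtained by dividing by NP and NN respectively.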
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
/////////////////////////////////////////////////////////////////////////
// Implement the general interface. Here we use SAME mass on all slices
/////////////////////////////////////////////////////////////////////////

View File

@ -334,7 +334,6 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
template <class Impl>
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
@ -346,7 +345,6 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@ -359,7 +357,6 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@ -418,47 +415,33 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
Compressor compressor;
int len = U.Grid()->oSites();
DhopTotalTime -= usecond();
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime -= usecond();
DhopComputeTime2 -= usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
}
@ -471,78 +454,16 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void ImprovedStaggeredFermion<Impl>::Report(void)
{
Coordinate latt = _grid->GlobalDimensions();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////

View File

@ -55,9 +55,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss = sss*Ls;
@ -73,7 +70,6 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
}
});
this->M5Dtime += usecond();
}
template<class Impl>
@ -99,9 +95,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
auto pshift_coeffs = &shift_coeffs[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss = sss*Ls;
@ -122,7 +115,6 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
}
});
this->M5Dtime += usecond();
}
template<class Impl>
@ -143,9 +135,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(), {
uint64_t ss = sss*Ls;
@ -161,8 +150,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
}
});
this->M5Dtime += usecond();
}
template<class Impl>
@ -186,9 +173,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
auto pshift_coeffs = &shift_coeffs[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
auto pm = this->pm;
int nloop = grid->oSites()/Ls;
@ -217,7 +201,6 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
}
});
this->M5Dtime += usecond();
}
template<class Impl>
@ -237,9 +220,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
@ -277,7 +257,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
}
});
this->MooeeInvTime += usecond();
}
template<class Impl>
@ -297,8 +276,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
auto pueem= & this->ueem[0];
auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0];
auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -343,7 +320,6 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
}
});
this->MooeeInvTime += usecond();
}
template<class Impl>
@ -363,9 +339,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
auto pleem= & this->leem[0];
auto pueem= & this->ueem[0];
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
@ -402,7 +375,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
coalescedWrite(chi[ss+s],res);
}
});
this->MooeeInvTime += usecond();
}
template<class Impl>
@ -423,9 +395,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls;
@ -469,7 +438,6 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
}
});
this->MooeeInvTime += usecond();
}
NAMESPACE_END(Grid);

View File

@ -263,7 +263,6 @@ void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionFiel
template <class Impl>
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
@ -275,7 +274,6 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@ -288,7 +286,6 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@ -345,47 +342,33 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
Compressor compressor;
int len = U.Grid()->oSites();
DhopTotalTime -= usecond();
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=0;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime -= usecond();
DhopComputeTime2 -= usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
}
template <class Impl>
@ -396,78 +379,16 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void NaiveStaggeredFermion<Impl>::Report(void)
{
Coordinate latt = _grid->GlobalDimensions();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////

View File

@ -34,8 +34,8 @@
NAMESPACE_BEGIN(Grid);
template<class Impl>
WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField& _Umu,
template<class Impl, class CloverHelpers>
WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
@ -74,8 +74,8 @@ WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField&
}
// *NOT* EO
template <class Impl>
void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@ -89,8 +89,8 @@ void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
out += temp;
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField &in, FermionField &out)
{
FermionField temp(out.Grid());
@ -104,8 +104,8 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
out += temp;
}
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeField &_Umu)
{
double t0 = usecond();
WilsonFermion<Impl>::ImportGauge(_Umu);
@ -131,47 +131,11 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
CloverTerm += Helpers::fillCloverXT(Ex) * csw_t;
CloverTerm += Helpers::fillCloverYT(Ey) * csw_t;
CloverTerm += Helpers::fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;
double t4 = usecond();
int lvol = _Umu.Grid()->lSites();
int DimRep = Impl::Dimension;
CloverHelpers::Instantiate(CloverTerm, CloverTermInv, csw_t, this->diag_mass);
double t5 = usecond();
{
autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor);
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++){
auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CTIv, lcoor);
});
}
double t6 = usecond();
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@ -184,48 +148,44 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
double t7 = usecond();
double t6 = usecond();
#if 0
std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
<< " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
<< ", allocations = " << (t2 - t1) / 1e6
<< ", field strength = " << (t3 - t2) / 1e6
<< ", fill clover = " << (t4 - t3) / 1e6
<< ", misc = " << (t5 - t4) / 1e6
<< ", inversions = " << (t6 - t5) / 1e6
<< ", pick cbs = " << (t7 - t6) / 1e6
<< ", total = " << (t7 - t0) / 1e6
<< std::endl;
#endif
std::cout << GridLogDebug << "WilsonCloverFermion::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiation = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t6 - t0) / 1e6 << std::endl;
}
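Editor's note: the per-site inversion removed above (now delegated to CloverHelpers::Instantiate) flattened each site clover matrix Q, indexed by spin indices j,k and colour indices a,b, into a dense (Ns·D)×(Ns·D) matrix with D = DimRep, inverted it with Eigen, and unpacked the inverse through the same index map:

```latex
M_{\,a + jD,\; b + kD} \;=\; Q^{\,ab}_{\,jk},
\qquad
\left(Q^{-1}\right)^{\,ab}_{\,jk} \;=\; \left(M^{-1}\right)_{\,a + jD,\; b + kD},
\qquad 0 \le j,k < N_s,\;\; 0 \le a,b < D .
```

For the fundamental representation (Ns = 4, D = 3) this is a 12x12 inversion per lattice site.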
template <class Impl>
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInvDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.Checkerboard() = in.Checkerboard();
CloverField *Clover;
@ -278,8 +238,8 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
} // MooeeInternal
// Derivative parts unpreconditioned pseudofermions
template <class Impl>
void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
@ -349,7 +309,7 @@ void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X,
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
@ -360,15 +320,15 @@ void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X,
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
{
assert(0);
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
template<class Impl, class CloverHelpers>
void WilsonCloverFermion<Impl, CloverHelpers>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
assert(0); // not implemented yet
}

View File

@ -60,7 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
UmuOdd (_FourDimRedBlackGrid),
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid),
_tmp(&FiveDimRedBlackGrid)
_tmp(&FiveDimRedBlackGrid),
Dirichlet(0)
{
// some assertions
assert(FiveDimGrid._ndimension==5);
@ -91,6 +92,17 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
}
if ( p.dirichlet.size() == Nd+1) {
Coordinate block = p.dirichlet;
if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
Dirichlet = 1;
Block = block;
}
} else {
Coordinate block(Nd+1,0);
Block = block;
}
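Editor's note, for illustration only (the block values below are hypothetical, not from this diff): p.dirichlet is a 5-component block; any non-zero entry enables the Dirichlet path, and components 1..4 become the 4d gauge blocking applied by DirichletFilter in ImportGauge further down (GaugeBlock[d] = Block[d+1]). A standalone sketch of that decision logic:

```cpp
// Standalone illustration (not Grid code) of how the 5d Dirichlet block is interpreted.
#include <array>
#include <iostream>

int main() {
  constexpr int Nd = 4;
  std::array<int, Nd + 1> block = {0, 0, 0, 0, 16};  // hypothetical: block only the last 4d direction

  bool dirichlet = false;
  for (int d = 0; d < Nd + 1; d++) if (block[d]) dirichlet = true;   // any non-zero entry switches it on

  std::array<int, Nd> gauge_block{};
  for (int d = 0; d < Nd; d++) gauge_block[d] = block[d + 1];        // drop the s-direction entry

  std::cout << "Dirichlet=" << dirichlet << " 4d block={";
  for (int d = 0; d < Nd; d++) std::cout << gauge_block[d] << (d < Nd - 1 ? "," : "}\n");
  return 0;
}
```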
if (Impl::LsVectorised) {
int nsimd = Simd::Nsimd();
@ -129,95 +141,20 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
// <<" " << StencilEven.surface_list.size()<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
RealD NP = _FourDimGrid->_Nprocessors;
RealD NN = _FourDimGrid->NodeCount();
RealD volume = Ls;
Coordinate latt = _FourDimGrid->GlobalDimensions();
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
// Average the compute time
_FourDimGrid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
}
if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; }
if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion5D Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
}
if ( DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion5D Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
}
}
template<class Impl>
void WilsonFermion5D<Impl>::ZeroCounters(void) {
DhopCalls = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopComputeTime2= 0;
DhopFaceTime = 0;
DhopTotalTime = 0;
DerivCalls = 0;
DerivCommTime = 0;
DerivComputeTime = 0;
DerivDhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
Stencil.ZeroCountersi();
StencilEven.ZeroCountersi();
StencilOdd.ZeroCountersi();
}
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
{
GaugeField HUmu(_Umu.Grid());
HUmu = _Umu*(-0.5);
if ( Dirichlet ) {
std::cout << GridLogDslash << " Dirichlet BCs 5d " <<Block<<std::endl;
Coordinate GaugeBlock(Nd);
for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
std::cout << GridLogDslash << " Dirichlet BCs 4d " <<GaugeBlock<<std::endl;
DirichletFilter<GaugeField> Filter(GaugeBlock);
Filter.applyFilter(HUmu);
}
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
pickCheckerboard(Even,UmuEven,Umu);
pickCheckerboard(Odd ,UmuOdd,Umu);
@ -259,7 +196,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
const FermionField &B,
int dag)
{
DerivCalls++;
assert((dag==DaggerNo) ||(dag==DaggerYes));
conformable(st.Grid(),A.Grid());
@ -270,15 +206,12 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
FermionField Btilde(B.Grid());
FermionField Atilde(B.Grid());
DerivCommTime-=usecond();
st.HaloExchange(B,compressor);
DerivCommTime+=usecond();
Atilde=A;
int LLs = B.Grid()->_rdimensions[0];
DerivComputeTime-=usecond();
for (int mu = 0; mu < Nd; mu++) {
////////////////////////////////////////////////////////////////////////
// Flip gamma if dag
@ -290,8 +223,6 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
// Call the single hop
////////////////////////
DerivDhopComputeTime -= usecond();
int Usites = U.Grid()->oSites();
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
@ -299,10 +230,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
////////////////////////////
// spin trace outer product
////////////////////////////
DerivDhopComputeTime += usecond();
Impl::InsertForce5D(mat, Btilde, Atilde, mu);
}
DerivComputeTime += usecond();
}
template<class Impl>
@ -360,12 +289,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopTotalTime-=usecond();
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
}
@ -374,6 +301,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
GRID_TRACE("DhopInternalOverlappedComms");
Compressor compressor(dag);
int LLs = in.Grid()->_rdimensions[0];
@ -382,53 +310,58 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
/////////////////////////////
// Start comms // Gather intranode and extra node differentiated??
/////////////////////////////
DhopFaceTime-=usecond();
st.HaloExchangeOptGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
{
GRID_TRACE("Gather");
st.HaloExchangeOptGather(in,compressor);
accelerator_barrier();
}
std::vector<std::vector<CommsRequest_t> > requests;
auto id=traceStart("Communicate overlapped");
st.CommunicateBegin(requests);
/////////////////////////////
// Overlap with comms
/////////////////////////////
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
{
GRID_TRACE("MergeSHM");
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
}
/////////////////////////////
// do the compute interior
/////////////////////////////
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
DhopComputeTime-=usecond();
if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
} else {
GRID_TRACE("DhopInterior");
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
}
DhopComputeTime+=usecond();
/////////////////////////////
// Complete comms
/////////////////////////////
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
traceStop(id);
/////////////////////////////
// do the compute exterior
/////////////////////////////
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
{
GRID_TRACE("Merge");
st.CommsMerge(compressor);
}
DhopComputeTime2-=usecond();
if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
} else {
GRID_TRACE("DhopExterior");
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
}
DhopComputeTime2+=usecond();
}
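Editor's note: GRID_TRACE, traceStart/traceStop and tracePush/tracePop above replace the accumulate-and-report timers with named regions that an external profiler (nvtx, roctx, or the text-based fallback mentioned in the commit list) can attribute directly. A minimal standalone sketch of the scoped-region idea, assuming nothing about Grid's actual tracing implementation:

```cpp
// Standalone sketch of RAII-scoped trace regions (illustrative only; the real
// hooks map onto nvtx/roctx ranges or a text-based fallback).
#include <chrono>
#include <iostream>
#include <string>

class TraceRegion {
public:
  explicit TraceRegion(std::string name)
    : name_(std::move(name)), t0_(std::chrono::steady_clock::now()) {}
  ~TraceRegion() {
    auto dt = std::chrono::duration<double, std::micro>(
                  std::chrono::steady_clock::now() - t0_).count();
    std::cout << "[trace] " << name_ << " " << dt << " us\n";  // region closes at end of scope
  }
private:
  std::string name_;
  std::chrono::steady_clock::time_point t0_;
};

// Give each region block scope so it opens on entry and closes at '}'.
#define SKETCH_TRACE(name) TraceRegion scoped_trace_region(name)

int main() {
  { SKETCH_TRACE("Gather");       /* pack halo faces here   */ }
  { SKETCH_TRACE("DhopInterior"); /* bulk stencil kernel    */ }
  { SKETCH_TRACE("Merge");        /* merge received faces   */ }
  { SKETCH_TRACE("DhopExterior"); /* surface stencil kernel */ }
  return 0;
}
```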
@ -438,29 +371,30 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
const FermionField &in,
FermionField &out,int dag)
{
GRID_TRACE("DhopInternalSerialComms");
Compressor compressor(dag);
int LLs = in.Grid()->_rdimensions[0];
{
GRID_TRACE("HaloExchange");
st.HaloExchangeOpt(in,compressor);
}
DhopCommTime-=usecond();
st.HaloExchangeOpt(in,compressor);
DhopCommTime+=usecond();
DhopComputeTime-=usecond();
int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) {
GRID_TRACE("DhopDag");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
} else {
GRID_TRACE("Dhop");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
}
DhopComputeTime+=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
DhopCalls++;
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
conformable(in.Grid(),out.Grid()); // drops the cb check
@ -472,7 +406,6 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
template<class Impl>
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
DhopCalls++;
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
conformable(in.Grid(),out.Grid()); // drops the cb check
@ -484,7 +417,6 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
template<class Impl>
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=2;
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
@ -539,12 +471,17 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
LatComplex sk(_grid); sk = Zero();
LatComplex sk2(_grid); sk2= Zero();
LatComplex W(_grid); W= Zero();
LatComplex a(_grid); a= Zero();
LatComplex one (_grid); one = ScalComplex(1.0,0.0);
LatComplex cosha(_grid);
LatComplex kmu(_grid);
LatComplex Wea(_grid);
LatComplex Wema(_grid);
LatComplex ea(_grid);
LatComplex ema(_grid);
LatComplex eaLs(_grid);
LatComplex emaLs(_grid);
LatComplex ea2Ls(_grid);
LatComplex ema2Ls(_grid);
LatComplex sinha(_grid);
LatComplex sinhaLs(_grid);
LatComplex coshaLs(_grid);
@ -579,39 +516,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
////////////////////////////////////////////
cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a_v,lcoor);
}
}
Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) );
sinha = 0.5*(exp( a) - exp(-a));
sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls));
coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls));
ea = (cosha + sqrt(cosha*cosha-one));
ema= (cosha - sqrt(cosha*cosha-one));
eaLs = pow(ea,Ls);
emaLs= pow(ema,Ls);
ea2Ls = pow(ea,2.0*Ls);
ema2Ls= pow(ema,2.0*Ls);
Wea= abs(W) * ea;
Wema= abs(W) * ema;
// a=log(ea);
sinha = 0.5*(ea - ema);
sinhaLs = 0.5*(eaLs-emaLs);
coshaLs = 0.5*(eaLs+emaLs);
A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass);
F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass);
F = eaLs * (one - Wea + (Wema - one) * mass*mass);
F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
F = F - abs(W) * sinha * 4.0 * mass;
Bpp = (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one);
Bmm = (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one);
App = (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one);
Amm = (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one);
Bpp = (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
Bmm = (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one);
App = (A/F) * (ema2Ls - one) * ema * (ema - abs(W)) * (one - mass*mass * one);
Amm = (A/F) * (one - ea2Ls) * ea * (ea - abs(W)) * (one - mass*mass * one);
ABpm = (A/F) * abs(W) * sinha * 2.0 * (one + mass * coshaLs * 2.0 + mass*mass * one);
//P+ source, P- source
@ -634,29 +561,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
buf1_4d = Zero();
ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
//G(s,t)
bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
bufR_4d = bufR_4d + A * eaLs * pow(ema,f) * signW * buf1_4d + A * emaLs * pow(ea,f) * signW * buf1_4d;
//A++*exp(a(s+t))
bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ;
bufR_4d = bufR_4d + App * pow(ea,ss) * pow(ea,tt) * signW * buf1_4d ;
//A+-*exp(a(s-t))
bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ;
bufR_4d = bufR_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf1_4d ;
//A-+*exp(a(-s+t))
bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ;
bufR_4d = bufR_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf1_4d ;
//A--*exp(a(-s-t))
bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;
bufR_4d = bufR_4d + Amm * pow(ema,ss) * pow(ema,tt) * signW * buf1_4d ;
//GL
buf2_4d = Zero();
ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
//G(s,t)
bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
bufL_4d = bufL_4d + A * eaLs * pow(ema,f) * signW * buf2_4d + A * emaLs * pow(ea,f) * signW * buf2_4d;
//B++*exp(a(s+t))
bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ;
bufL_4d = bufL_4d + Bpp * pow(ea,ss) * pow(ea,tt) * signW * buf2_4d ;
//B+-*exp(a(s-t))
bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ;
bufL_4d = bufL_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf2_4d ;
//B-+*exp(a(-s+t))
bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ;
bufL_4d = bufL_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf2_4d ;
//B--*exp(a(-s-t))
bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ;
bufL_4d = bufL_4d + Bmm * pow(ema,ss) * pow(ema,tt) * signW * buf2_4d ;
}
InsertSlice(bufR_4d, GR, (ss-1), 0);
InsertSlice(bufL_4d, GL, (ss-1), 0);
@ -775,28 +702,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
W = one - M5 + sk2;
////////////////////////////////////////////
// Cosh alpha -> alpha
// Cosh alpha -> exp(+/- alpha)
////////////////////////////////////////////
cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd);
Tcomplex cc;
// RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a_v,lcoor);
}}
Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) );
Wea = abs(W)*(cosha + sqrt(cosha*cosha-one));
Wema= abs(W)*(cosha - sqrt(cosha*cosha-one));
num = num + ( one - Wema ) * mass * in;
denom= ( Wea - one ) + mass*mass * (one - Wema);
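Editor's note: both propagator hunks above eliminate the per-site acosh loop (and its CpuRead/CpuWrite round trip) because the exponentials follow in closed form from cosha:

```latex
e^{\pm a} \;=\; \cosh a \pm \sinh a \;=\; \cosh a \pm \sqrt{\cosh^2 a - 1}\,,
\qquad
e^{\,a L_s} \;=\; \big(e^{a}\big)^{L_s},
```

so ea, ema and their pow(...) powers replace exp(±a) and exp(±a·Ls) everywhere, with Wea = |W|·ea and Wema = |W|·ema as before.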

View File

@ -4,12 +4,13 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
Copyright (C) 2015
Copyright (C) 2022
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Fabian Joswig <fabian.joswig@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -75,91 +76,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
StencilOdd.BuildSurfaceList(1,vol4);
}
template<class Impl>
void WilsonFermion<Impl>::Report(void)
{
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
RealD volume = 1;
Coordinate latt = _grid->GlobalDimensions();
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
}
if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
// how to count flops here?
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call ? : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node ? : " << mflops/NP << std::endl;
// how to count flops here?
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) ? : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl; }
if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
if ( DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
}
}
template<class Impl>
void WilsonFermion<Impl>::ZeroCounters(void) {
DhopCalls = 0; // ok
DhopCommTime = 0;
DhopComputeTime = 0;
DhopComputeTime2= 0;
DhopFaceTime = 0;
DhopTotalTime = 0;
DerivCalls = 0; // ok
DerivCommTime = 0;
DerivComputeTime = 0;
DerivDhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
Stencil.ZeroCountersi();
StencilEven.ZeroCountersi();
StencilOdd.ZeroCountersi();
}
template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
@ -319,7 +235,6 @@ template <class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField &mat, const FermionField &A,
const FermionField &B, int dag) {
DerivCalls++;
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
@ -328,11 +243,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
FermionField Atilde(B.Grid());
Atilde = A;
DerivCommTime-=usecond();
st.HaloExchange(B, compressor);
DerivCommTime+=usecond();
DerivComputeTime-=usecond();
for (int mu = 0; mu < Nd; mu++) {
////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag
@ -340,7 +252,6 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
int gamma = mu;
if (!dag) gamma += Nd;
DerivDhopComputeTime -= usecond();
int Ls=1;
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@ -348,9 +259,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
// spin trace outer product
//////////////////////////////////////////////////
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
DerivDhopComputeTime += usecond();
}
DerivComputeTime += usecond();
}
template <class Impl>
@ -397,7 +306,6 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
@ -409,7 +317,6 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@ -422,7 +329,6 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
@ -487,14 +393,12 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
const FermionField &in,
FermionField &out, int dag)
{
DhopTotalTime-=usecond();
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
#endif
DhopInternalSerial(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
}
template <class Impl>
@ -503,6 +407,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
const FermionField &in,
FermionField &out, int dag)
{
GRID_TRACE("DhopOverlapped");
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
@ -513,53 +418,55 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
/////////////////////////////
std::vector<std::vector<CommsRequest_t> > requests;
st.Prepare();
DhopFaceTime-=usecond();
st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
{
GRID_TRACE("Gather");
st.HaloGather(in,compressor);
}
DhopCommTime -=usecond();
tracePush("Communication");
st.CommunicateBegin(requests);
/////////////////////////////
// Overlap with comms
/////////////////////////////
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+=usecond();
{
GRID_TRACE("MergeSHM");
st.CommsMergeSHM(compressor);
}
/////////////////////////////
// do the compute interior
/////////////////////////////
int Opt = WilsonKernelsStatic::Opt;
DhopComputeTime-=usecond();
if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
} else {
GRID_TRACE("DhopInterior");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
}
DhopComputeTime+=usecond();
/////////////////////////////
// Complete comms
/////////////////////////////
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
tracePop("Communication");
{
GRID_TRACE("Merge");
st.CommsMerge(compressor);
}
/////////////////////////////
// do the compute exterior
/////////////////////////////
DhopComputeTime2-=usecond();
if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
} else {
GRID_TRACE("DhopExterior");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
}
DhopComputeTime2+=usecond();
};
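The GRID_TRACE / tracePush / tracePop hooks used above replace the old usecond()-based counters with named ranges that an external profiler can display (nvtx on NVIDIA, roctx on AMD, plain text otherwise). As a rough illustration of the scoped-range idea only, the sketch below uses hypothetical names (SKETCH_TRACE, TraceRange, USE_NVTX/USE_ROCTX); it is not Grid's implementation, and the NVTX/rocTX header paths are assumptions.

// Minimal sketch of a scoped tracing range in the nvtx/roctx style.
// SKETCH_TRACE / TraceRange are hypothetical stand-ins for Grid's GRID_TRACE.
#if defined(USE_NVTX)
#include <nvToolsExt.h>                 // header location may vary with the CUDA install
static void rangePush(const char *name) { nvtxRangePushA(name); }
static void rangePop(void)              { nvtxRangePop(); }
#elif defined(USE_ROCTX)
#include <roctracer/roctx.h>            // header location may vary with the ROCm install
static void rangePush(const char *name) { roctxRangePushA(name); }
static void rangePop(void)              { roctxRangePop(); }
#else
#include <cstdio>
static void rangePush(const char *name) { std::printf("BEGIN %s\n", name); }
static void rangePop(void)              { std::printf("END\n"); }
#endif

// RAII guard: the range closes when the enclosing scope exits, which is why the
// diff wraps each phase in braces, e.g. { GRID_TRACE("Gather"); st.HaloGather(...); }
struct TraceRange {
  TraceRange(const char *name) { rangePush(name); }
  ~TraceRange()                { rangePop(); }
};
#define SKETCH_TRACE(name) TraceRange _trace_guard_(name)

int main(void) {
  SKETCH_TRACE("DhopOverlapped");
  { SKETCH_TRACE("Gather");       /* halo gather would run here     */ }
  { SKETCH_TRACE("DhopInterior"); /* interior kernel would run here */ }
  { SKETCH_TRACE("Merge");        /* comms merge would run here     */ }
  return 0;
}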
@ -569,20 +476,22 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
const FermionField &in,
FermionField &out, int dag)
{
GRID_TRACE("DhopSerial");
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
DhopCommTime-=usecond();
st.HaloExchange(in, compressor);
DhopCommTime+=usecond();
{
GRID_TRACE("HaloExchange");
st.HaloExchange(in, compressor);
}
DhopComputeTime-=usecond();
int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) {
GRID_TRACE("DhopDag");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
} else {
GRID_TRACE("Dhop");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
}
DhopComputeTime+=usecond();
};
/*Change ends */
@ -599,11 +508,47 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
Current curr_type,
unsigned int mu)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
Gamma g5(Gamma::Algebra::Gamma5);
conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid());
assert(0);
auto UGrid= this->GaugeGrid();
PropagatorField tmp_shifted(UGrid);
PropagatorField g5Lg5(UGrid);
PropagatorField R(UGrid);
PropagatorField gmuR(UGrid);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
g5Lg5=g5*q_in_1*g5;
tmp_shifted=Cshift(q_in_2,mu,1);
Impl::multLinkField(R,this->Umu,tmp_shifted,mu);
gmuR=gmu*R;
q_out=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
tmp_shifted=Cshift(q_in_1,mu,1);
Impl::multLinkField(g5Lg5,this->Umu,tmp_shifted,mu);
g5Lg5=g5*g5Lg5*g5;
R=q_in_2;
gmuR=gmu*R;
q_out-=adj(g5Lg5)*R;
q_out-=adj(g5Lg5)*gmuR;
}
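For orientation, the quantity inserted and contracted in this routine is the point-split (conserved) vector current of the Wilson action; up to the sign and overall normalisation conventions of this implementation it reads

\[
J_\mu^{\rm cons}(x) \;=\; \tfrac{1}{2}\Big[\,\bar\psi(x)\,(\gamma_\mu-1)\,U_\mu(x)\,\psi(x+\hat\mu)
\;+\;\bar\psi(x+\hat\mu)\,(\gamma_\mu+1)\,U_\mu^\dagger(x)\,\psi(x)\,\Big].
\]

The backward-going propagator needed for the contraction is obtained from gamma5-hermiticity, \(\bar S(0,x)=\gamma_5\,S(x,0)^\dagger\,\gamma_5\), which is what the g5Lg5 intermediates and the adj() calls above implement.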
@ -617,9 +562,51 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
if(curr_type != Current::Vector)
{
std::cout << GridLogError << "Only the conserved vector current is implemented so far." << std::endl;
exit(1);
}
int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid());
assert(0);
auto UGrid= this->GaugeGrid();
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
PropagatorField L(UGrid);
PropagatorField zz (UGrid);
zz=Zero();
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
};
Gamma gmu=Gamma(Gmu[mu]);
tmp = Cshift(q_in,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = ( Utmp*lattice_cmplx - gmu*Utmp*lattice_cmplx ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
q_out = where((lcoor<=tmax),tmp,zz); // Positioning of the current insertion is complicated
tmp = q_in *lattice_cmplx;
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -( Utmp + gmu*Utmp );
// Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
q_out+= where((lcoor<=tmax+tshift),tmp,zz); // Positioning of the current insertion is complicated
}
NAMESPACE_END(Grid);

View File

@ -72,20 +72,15 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
if (SE->_is_local) { \
int perm= SE->_permute; \
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
spProj(chi,tmp); \
} else if ( st.same_node[Dir] ) { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
if (SE->_is_local || st.same_node[Dir] ) { \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
} \
spProj(chi,tmp); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
} \
acceleratorSynchronise();
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
if (!SE->_is_local ) { \
auto chi = coalescedRead(buf[SE->_offset],lane); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
@ -416,19 +411,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody
}
#define KERNEL_CALL_TMP(A) \
const uint64_t NN = Nsite*Ls; \
auto U_p = & U_v[0]; \
auto in_p = & in_v[0]; \
auto out_p = & out_v[0]; \
auto st_p = st_v._entries_p; \
auto st_perm = st_v._permute_type; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
}); \
accelerator_barrier();
#define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \
@ -440,6 +422,16 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
#define KERNEL_CALL_EXT(A) \
const uint64_t NN = Nsite*Ls; \
const uint64_t sz = st.surface_list.size(); \
auto ptr = &st.surface_list[0]; \
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
int sF = ptr[ss]; \
int sU = sF/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
});
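KERNEL_CALL_EXT above runs the exterior update only over the sites recorded in st.surface_list, rather than sweeping the full volume a second time once the halo has arrived. A Grid-independent sketch of the same idea, with plain loops standing in for accelerator_forNB and hypothetical names throughout:

#include <cstdint>
#include <vector>

// Sketch: interior pass over the whole volume while communications are in
// flight, then an exterior pass that revisits only the sites whose stencil
// reads from the halo.  surface_list is built once at stencil setup time.
using SiteKernel = void (*)(uint64_t sF, uint64_t sU);

void interior_then_exterior(uint64_t Nsite, uint64_t Ls,
                            const std::vector<uint64_t> &surface_list,
                            SiteKernel interior, SiteKernel exterior)
{
  for (uint64_t sF = 0; sF < Nsite * Ls; sF++)
    interior(sF, sF / Ls);              // interior contributions, full volume

  /* ... CommunicateComplete / CommsMerge would happen here ... */

  for (uint64_t ss = 0; ss < surface_list.size(); ss++) {
    uint64_t sF = surface_list[ss];     // fermion site index from the list
    uint64_t sU = sF / Ls;              // corresponding gauge site
    exterior(sF, sU);                   // add the halo-dependent terms only
  }
}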
#define ASM_CALL(A) \
thread_for( ss, Nsite, { \
int sU = ss; \
@ -460,7 +452,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif

View File

@ -9,6 +9,7 @@
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -32,10 +33,12 @@
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion<IMPLEMENTATION>;
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -8,7 +8,8 @@
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Mattia Bruno <mattia.bruno@cern.ch>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
@ -31,10 +32,12 @@
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h>
#include <Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonCloverFermion<IMPLEMENTATION>;
template class WilsonCloverFermion<IMPLEMENTATION, CloverHelpers<IMPLEMENTATION>>;
template class WilsonCloverFermion<IMPLEMENTATION, ExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1,51 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
#ifndef AVX512
#ifndef QPX
#ifndef A64FX
#ifndef A64FXFIXEDSIZE
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
#endif
#endif
#endif
#endif
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class WilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -18,6 +18,10 @@ WILSON_IMPL_LIST=" \
GparityWilsonImplF \
GparityWilsonImplD "
COMPACT_WILSON_IMPL_LIST=" \
WilsonImplF \
WilsonImplD "
DWF_IMPL_LIST=" \
WilsonImplF \
WilsonImplD \
@ -40,13 +44,23 @@ EOF
done
CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation"
for impl in $WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
CC_LIST="CompactWilsonCloverFermionInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
@ -63,14 +77,14 @@ for impl in $DWF_IMPL_LIST $GDWF_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done
# overwrite the .cc file in Gparity directories
for impl in $GDWF_IMPL_LIST
do
ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
ln -f -s ../WilsonKernelsInstantiationGparity.cc.master $impl/WilsonKernelsInstantiation$impl.cc
done
@ -84,7 +98,7 @@ for impl in $STAG_IMPL_LIST
do
for f in $CC_LIST
do
ln -f -s ../$f.cc.master $impl/$f$impl.cc
ln -f -s ../$f.cc.master $impl/$f$impl.cc
done
done

View File

@ -0,0 +1,115 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
//--------------------------------------------------------------------
#pragma once
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////
// DDHMC filter with sub-block size B[mu]
////////////////////////////////////////////////////
template<typename GaugeField>
struct DDHMCFilter: public MomentumFilterBase<GaugeField>
{
Coordinate Block;
int Width;
DDHMCFilter(const Coordinate &_Block,int _Width=2): Block(_Block) { Width=_Width; }
void applyFilter(GaugeField &U) const override
{
GridBase *grid = U.Grid();
Coordinate Global=grid->GlobalDimensions();
GaugeField zzz(grid); zzz = Zero();
LatticeInteger coor(grid);
auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
////////////////////////////////////////////////////
// Zero BDY layers
////////////////////////////////////////////////////
std::cout<<GridLogMessage<<" DDHMC Force Filter Block "<<Block<<" width " <<Width<<std::endl;
for(int mu=0;mu<Nd;mu++) {
Integer B1 = Block[mu];
if ( B1 && (B1 <= Global[mu]) ) {
LatticeCoordinate(coor,mu);
////////////////////////////////
// OmegaBar - zero all links contained in slice B-1,0 and
// mu links connecting to Omega
////////////////////////////////
if ( Width==1) {
U = where(mod(coor,B1)==Integer(B1-1),zzz,U);
U = where(mod(coor,B1)==Integer(0) ,zzz,U);
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
if ( Width==2) {
U = where(mod(coor,B1)==Integer(B1-2),zzz,U);
U = where(mod(coor,B1)==Integer(B1-1),zzz,U);
U = where(mod(coor,B1)==Integer(0) ,zzz,U);
U = where(mod(coor,B1)==Integer(1) ,zzz,U);
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
U_mu = where(mod(coor,B1)==Integer(B1-3),zzz_mu,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
if ( Width==3) {
U = where(mod(coor,B1)==Integer(B1-3),zzz,U);
U = where(mod(coor,B1)==Integer(B1-2),zzz,U);
U = where(mod(coor,B1)==Integer(B1-1),zzz,U);
U = where(mod(coor,B1)==Integer(0) ,zzz,U);
U = where(mod(coor,B1)==Integer(1) ,zzz,U);
U = where(mod(coor,B1)==Integer(2) ,zzz,U);
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
U_mu = where(mod(coor,B1)==Integer(B1-4),zzz_mu,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
if ( Width==4) {
U = where(mod(coor,B1)==Integer(B1-4),zzz,U);
U = where(mod(coor,B1)==Integer(B1-3),zzz,U);
U = where(mod(coor,B1)==Integer(B1-2),zzz,U);
U = where(mod(coor,B1)==Integer(B1-1),zzz,U);
U = where(mod(coor,B1)==Integer(0) ,zzz,U);
U = where(mod(coor,B1)==Integer(1) ,zzz,U);
U = where(mod(coor,B1)==Integer(2) ,zzz,U);
U = where(mod(coor,B1)==Integer(3) ,zzz,U);
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
U_mu = where(mod(coor,B1)==Integer(B1-5),zzz_mu,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
}
}
}
};
NAMESPACE_END(Grid);
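The Width==1..4 branches above repeat one masking pattern per boundary layer; written as a loop over offsets the intent is easier to see. The sketch below is a hypothetical refactor of the body of applyFilter, reusing its local variables (U, coor, zzz, zzz_mu, Block, Global, Width) and the same Grid primitives; it is meant to reproduce the explicit branches and is an illustration, not the committed code (which the commit log itself flags as too broad at Width=4).

// Hypothetical loop form of the Width-dependent masking in DDHMCFilter::applyFilter.
for (int mu = 0; mu < Nd; mu++) {
  Integer B1 = Block[mu];
  if (B1 && (B1 <= Global[mu])) {
    LatticeCoordinate(coor, mu);
    // Zero every link in the Width boundary layers on each side of the cut.
    for (int w = 0; w < Width; w++) {
      U = where(mod(coor, B1) == Integer(B1 - 1 - w), zzz, U);
      U = where(mod(coor, B1) == Integer(w),          zzz, U);
    }
    // Also zero the mu-directed link one layer further in, which connects to Omega.
    auto U_mu = PeekIndex<LorentzIndex>(U, mu);
    U_mu = where(mod(coor, B1) == Integer(B1 - 1 - Width), zzz_mu, U_mu);
    PokeIndex<LorentzIndex>(U, U_mu, mu);
  }
}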

View File

@ -0,0 +1,71 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/integrators/DirichletFilter.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
//--------------------------------------------------------------------
#pragma once
NAMESPACE_BEGIN(Grid);
template<typename MomentaField>
struct DirichletFilter: public MomentumFilterBase<MomentaField>
{
typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
typedef iScalar<iScalar<iScalar<vector_type> > > ScalarType; //complex phase for each site
Coordinate Block;
DirichletFilter(const Coordinate &_Block): Block(_Block){}
void applyFilter(MomentaField &P) const override
{
GridBase *grid = P.Grid();
typedef decltype(PeekIndex<LorentzIndex>(P, 0)) LatCM;
////////////////////////////////////////////////////
// Zero only the links that cross between domains
////////////////////////////////////////////////////
LatticeInteger coor(grid);
LatCM zz(grid); zz = Zero();
for(int mu=0;mu<Nd;mu++) {
if ( (Block[mu]) && (Block[mu] <= grid->GlobalDimensions()[mu] ) ) {
// If costly could provide Grid earlier and precompute masks
std::cout << GridLogMessage << " Dirichlet in mu="<<mu<<std::endl;
LatticeCoordinate(coor,mu);
auto P_mu = PeekIndex<LorentzIndex>(P, mu);
P_mu = where(mod(coor,Block[mu])==Integer(Block[mu]-1),zz,P_mu);
PokeIndex<LorentzIndex>(P, P_mu, mu);
}
}
}
};
NAMESPACE_END(Grid);
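A hypothetical usage sketch of DirichletFilter follows: it zeroes the momenta on links that cross an 8^4 sub-block boundary before an update. Grid names and the standard test boilerplate are used, but the field and block sizes are illustrative, and the filter header is assumed to be reachable from the usual Grid includes (an explicit include of the momentum-filter headers may be needed).

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  GridCartesian *UGrid =
    SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                   GridDefaultSimd(Nd, vComplex::Nsimd()),
                                   GridDefaultMpi());

  GridParallelRNG RNG(UGrid);
  RNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));

  LatticeGaugeField P(UGrid);   // stands in for the HMC momenta
  gaussian(RNG, P);

  Coordinate Block({8, 8, 8, 8});
  DirichletFilter<LatticeGaugeField> Filter(Block);
  Filter.applyFilter(P);        // links crossing a Block boundary are set to zero

  Grid_finalize();
  return 0;
}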

View File

@ -37,7 +37,8 @@ NAMESPACE_BEGIN(Grid);
template<typename MomentaField>
struct MomentumFilterBase{
virtual void applyFilter(MomentaField &P) const;
virtual void applyFilter(MomentaField &P) const = 0;
virtual ~MomentumFilterBase(){};
};
//Do nothing
@ -83,7 +84,6 @@ struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
}
};
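The MomentumFilterBase change above (applyFilter made pure virtual, plus a virtual destructor) matters because filters are typically owned and destroyed through base-class pointers. A minimal, Grid-independent sketch of why the virtual destructor is needed, with hypothetical names:

#include <iostream>
#include <memory>
#include <vector>

// Hypothetical stand-ins for MomentumFilterBase and a derived filter.
struct FilterBase {
  virtual void apply() const = 0;   // pure virtual: the base is an interface only
  virtual ~FilterBase() {}          // virtual dtor: derived cleanup runs via base pointer
};

struct BlockFilter : FilterBase {
  std::vector<int> mask;            // per-filter state that must be released
  BlockFilter() : mask(1024, 0) {}
  void apply() const override { std::cout << "apply filter" << std::endl; }
};

int main(void) {
  // Held through the base class, much as an integrator would hold its momentum filter.
  std::unique_ptr<FilterBase> filter = std::make_unique<BlockFilter>();
  filter->apply();
  return 0;                         // ~BlockFilter() runs because ~FilterBase() is virtual
}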

View File

@ -69,6 +69,11 @@ public:
return PeriodicBC::ShiftStaple(Link,mu);
}
//Same as Cshift for periodic BCs
static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
return PeriodicBC::CshiftLink(Link,mu,shift);
}
static inline bool isPeriodicGaugeField(void) { return true; }
};
@ -110,6 +115,11 @@ public:
return PeriodicBC::CovShiftBackward(Link, mu, field);
}
//If mu is a conjugate BC direction
//Out(x) = U^dag_\mu(x-mu) | x_\mu != 0
// = U^T_\mu(L-1) | x_\mu == 0
//else
//Out(x) = U^dag_\mu(x-mu mod L)
static inline GaugeLinkField
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
{
@ -129,6 +139,13 @@ public:
return PeriodicBC::CovShiftIdentityForward(Link,mu);
}
//If mu is a conjugate BC direction
//Out(x) = S_\mu(x+mu) | x_\mu != L-1
// = S*_\mu(x+mu) | x_\mu == L-1
//else
//Out(x) = S_\mu(x+mu mod L)
//Note: While this is used for Staples it is also applicable for shifting gauge links or gauge transformation matrices
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
{
assert(_conjDirs.size() == Nd);
@ -138,6 +155,27 @@ public:
return PeriodicBC::ShiftStaple(Link,mu);
}
//Boundary-aware C-shift of gauge links / gauge transformation matrices
//For conjugate BC direction
//shift = 1
//Out(x) = U_\mu(x+\hat\mu) | x_\mu != L-1
// = U*_\mu(0) | x_\mu == L-1
//shift = -1
//Out(x) = U_\mu(x-mu) | x_\mu != 0
// = U*_\mu(L-1) | x_\mu == 0
//else
//shift = 1
//Out(x) = U_\mu(x+\hat\mu mod L)
//shift = -1
//Out(x) = U_\mu(x-\hat\mu mod L)
static inline GaugeLinkField CshiftLink(const GaugeLinkField &Link, int mu, int shift){
assert(_conjDirs.size() == Nd);
if(_conjDirs[mu])
return ConjugateBC::CshiftLink(Link,mu,shift);
else
return PeriodicBC::CshiftLink(Link,mu,shift);
}
static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
static inline std::vector<int> getDirections(void) { return _conjDirs; }
static inline bool isPeriodicGaugeField(void) { return false; }
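As a hypothetical usage fragment of the new CshiftLink with charge-conjugate boundary conditions (U, mu and the ConjugateGimplD typedef are assumed to come from the usual Grid gauge-implementation setup):

// Fragment: shift a gauge link field by one site in direction mu, picking up
// complex conjugation when the shift wraps a conjugate-BC direction.
std::vector<int> conj_dirs({1, 1, 0, 0});               // conjugate BCs in x and y (assumption)
ConjugateGimplD::setDirections(conj_dirs);

auto U_mu  = PeekIndex<LorentzIndex>(U, mu);            // U: a LatticeGaugeFieldD from context
auto U_fwd = ConjugateGimplD::CshiftLink(U_mu, mu, +1); // forward shift, boundary-aware
auto U_bwd = ConjugateGimplD::CshiftLink(U_mu, mu, -1); // backward shift, boundary-aware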

Some files were not shown because too many files have changed in this diff.