1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-24 01:34:47 +01:00

Compare commits

..

1592 Commits

Author SHA1 Message Date
Christopher Kelly
4fefae1745 Test_evec_compression changes:
Added ability to choose one of a variety of preselected basis sizes from the command line
	Fine lanczos now checks enough evecs are generated and resizes the output to Nstop and not the actual amount that converged (which can be larger)
2022-04-06 06:33:26 -07:00
Christopher Kelly
758e2edcad Test_evec_compression enhancements:
In testing the compressed evecs, a Cheybshev smoothing is now applied first to remove high mode noise
	Added a second test where the uncompressed evecs are compared directly to the original evecs
	Generalized the test to allow for either DWF or Mobius with or without GPBC, switched by command line options
2022-03-29 06:16:15 -07:00
Christopher Kelly
1538b15f3b 48ID evo main program now uses reliable update CG 2022-03-14 06:45:28 -07:00
Christopher Kelly
deac621c2c Merge branch 'develop' into gparity_HMC_merge_develop 2022-02-22 14:25:27 -05:00
Peter Boyle
63dbaeefaa Extra barrier prior to finalize just in case it fixes an issue on Tursa 2022-02-16 14:01:43 +00:00
Peter Boyle
e8c187b323 SyCL happier? 2022-02-15 11:24:38 -05:00
Peter Boyle
0c1618197f Faster intranode MPI works now 2022-02-15 08:52:07 -05:00
Peter Boyle
f49d5c2d22 Updated scripts for crusher 2022-02-14 17:55:16 -05:00
Peter Boyle
a3b022d469 Crusher compile 2022-02-14 15:09:08 -05:00
Christopher Kelly
ba974960e6 Added an HMC checkpoint start option that loads the fields and then reseeds the RNGs, suitable for creating new evolution streams
Added option to choose RNG seeds in 40ID main binary
2022-02-14 08:09:01 -08:00
Peter Boyle
48772f0976 Merge pull request #384 from jdmaia/hip_launchbounds
Changing thread block order and adding launch_bounds
2022-02-14 11:08:28 -05:00
Peter Boyle
c322420580 Dont instantiate an Nc=3 and non-GP hardwired code for other implementations 2022-02-14 16:04:08 +00:00
Julio Maia
86f4e17928 Changing thread block order and adding launch_bounds 2022-02-07 11:29:37 -06:00
Peter Boyle
215df671be Merge pull request #382 from DanielRichtmann/feature/compact-clover
Compact Clover Fermions
2022-02-01 21:45:38 -05:00
Daniel Richtmann
1b6b12589f Get splitting up into implementation and instantiation files correct 2022-02-02 00:51:11 +01:00
Daniel Richtmann
3082ab8252 Check in compact version of wilson clover fermions 2022-02-02 00:50:05 +01:00
Daniel Richtmann
add86cd7f4 Abandon ET for clover application, use construct similar to multLink 2022-02-01 23:09:06 +01:00
Daniel Richtmann
0b6fd20c54 Enable memory coalescing in clover term generation 2022-02-01 23:09:06 +01:00
Daniel Richtmann
e83423fee6 Refactor clover to align with other files and prepare for upcoming changes 2022-02-01 23:09:06 +01:00
Daniel Richtmann
b4f8e87982 Have Grid's cli interface understand floats 2022-02-01 23:09:06 +01:00
Christopher Kelly
6755dc57f8 Added methods to compute spatial plaquette and timeslice spatial plaquette to WilsonLoops 2022-01-24 13:57:39 -05:00
Christopher Kelly
aa620ca52c Fixed compilation error in observables resulting from changes in Wilson flow code
Modified light quark mass on 40ID HMC binary
2022-01-24 09:56:24 -08:00
Christopher Kelly
2c46c942cc Reworked WilsonFlow:
Both smear and smear_adaptive now maintain the Wilson flow time as a function variable rather than a class member variable. smear_adaptive does likewise for the current time step. This allows the evolve and smear functions to be const
	Fixed smear_adaptive setting initial time to epsilon rather than 0
	Added ability to assign generic measurement actions at user specified frequencies during the smearing and reimplemented current energy density / topq output in this framework
	Reimplemented the "flowMeasure" methods using the above framework
Fixed const correctness for WilsonLoops::TopologicalCharge
2022-01-24 12:06:05 -05:00
Christopher Kelly
adeba8059a Added calculation of timeslice topological charge 2022-01-20 14:29:07 -05:00
Christopher Kelly
c4ac528126 Added cloverleaf energy density calculation to WilsonFlow 2021-12-27 10:33:33 -05:00
Christopher Kelly
551b93ba8e To HMC/Mobius2p1fIDSDRGparityEOFA_40ID, added input param to change trajectory length and increased integrator steps for DSDR 2021-12-10 09:06:06 -08:00
Peter Boyle
135808dcfa Less verbose 2021-12-07 16:24:24 -05:00
Peter Boyle
7f7d06d963 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-12-07 09:06:42 -08:00
Peter Boyle
2bf3b4d576 Update to reduce memory footpring in benchmark test 2021-12-07 09:02:02 -08:00
Christopher Kelly
ddf7540510 Added calculation of 5Li topological charge
WilsonFlow code now calls topological charge calculation with correct gauge implementation rather than assuming periodic
Added version of WilsonFlow::flowMeasureEnergyDensityPlaquette that outputs the smeared gauge field at the end
2021-12-06 17:56:42 -05:00
Christopher Kelly
de68d12c3d 1x1 topological charge calculation now respects gauge boundary conditions 2021-12-06 13:42:09 -05:00
Peter Boyle
f34d34bd17 2 nodes 2021-11-22 22:27:16 -05:00
Peter Boyle
e32d5141b4 Updated to make MPI reliable still gives good perf, but MPI will be slow
intranode
2021-11-22 21:46:31 -05:00
Peter Boyle
6d5277f2d7 Update to Spock 2021-11-22 20:58:02 -05:00
Peter Boyle
14d82777e0 Best modules for spock 2021-11-22 20:47:16 -05:00
Peter Boyle
2a4e739513 Enable XGMI copy (need to rename nvlink to cover NVLINK/XGMI/XeLink) 2021-11-22 20:46:09 -05:00
Peter Boyle
8079dc2a14 Cray MPI not working right yet 2021-11-22 20:45:44 -05:00
Peter Boyle
6ceb556684 Intranode asynch hipMemCopy 2021-11-22 20:45:12 -05:00
Peter Boyle
76cde73705 HIP improvements on messaging and intranode hipMemCopyAsynch 2021-11-22 20:44:39 -05:00
Christopher Kelly
6d26a2a1ad Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into gparity_HMC 2021-11-16 07:32:47 -08:00
Christopher Kelly
a1211cdcce Gparity 48ID tuning and exposure of trajectory length as input variable 2021-11-16 07:31:41 -08:00
Peter Boyle
cc094366a9 Merge pull request #375 from JPRichings/develop
Lattice object ACCcache probe
2021-11-09 18:19:32 -05:00
41a575ff9b Format edit 2021-11-09 21:56:23 +00:00
12ef413065 fix to deflation.h 2021-11-09 21:20:36 +00:00
829a328451 remove deflation timing 2021-11-09 20:46:57 +00:00
402523c62e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-11-09 12:57:40 +00:00
d7bef70b5c Helper functions to allow probe of cache state of lattice objects. 2021-11-09 12:57:09 +00:00
2ad1811642 Added timing to deflation code. 2021-11-09 12:33:25 +00:00
Christopher Kelly
e78acf77ff To LocalCoherenceLanczos, added a method to reconstruct the fine eigenvector and added some comments to aid the user
Added a test code for local coherence Lanczos with G-parity BCs
Added a test code for block eigenvector compression
2021-11-08 07:26:35 -08:00
a65a497bae Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-10-29 13:01:34 +01:00
b27b12828e reverse previous "fix", missing statement was probably intentional, added a comment to that effect 2021-10-29 13:01:31 +01:00
Peter Boyle
fe9edf8526 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-29 02:03:27 +01:00
Peter Boyle
44204c7e06 Extra code 2021-10-29 02:02:56 +01:00
Peter Boyle
33b3789598 Merge pull request #364 from AndrewYongZhenNing/develop
CayleyFermion5D Conserved current fix
2021-10-27 20:27:20 -04:00
Peter Boyle
195ab2888d Merge branch 'develop' into develop 2021-10-27 20:26:57 -04:00
Peter Boyle
85f750d753 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-27 00:28:05 +01:00
Peter Boyle
a4ce6e42c7 Warning free compile on make all and make tests under nvcc 2021-10-27 00:27:03 +01:00
Peter Boyle
5398b7e7e3 Max 128 size 2021-10-26 09:16:29 -07:00
fd13a3f2be Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-26 10:45:46 +01:00
c144b32368 deflation timers 2021-10-26 10:37:24 +01:00
Peter Boyle
ba7e371b90 Warning free compile on Tursa.
Hopefully got all reqd virtual dtors
2021-10-21 19:56:52 +01:00
Peter Boyle
99e7a5d18a Merge pull request #371 from edbennett/hmc-documentation-update
update documentation for GenericHMCRunner - thanks
2021-10-18 14:36:43 -04:00
Christopher Kelly
f7e9621492 40ID ensemble tuning: now use 5 Hasenbusch steps, parameters now separately tunable in param file 2021-10-18 08:17:36 -07:00
f824d99059 update documentation for GenericHMCRunner 2021-10-18 09:50:16 +01:00
Peter Boyle
749b8022a4 Linear operator and SparseMatrix virtual destructors 2021-10-15 20:47:18 +01:00
Peter Boyle
7e0057d2c4 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-10-15 20:46:51 +01:00
Peter Boyle
cfe9e870d3 Stream 2021-10-15 20:46:44 +01:00
Christopher Kelly
f14be15f8b Updates to Gparity HMC main programs 2021-10-15 08:10:17 -07:00
Peter Boyle
e9c4f06cbf Merge pull request #370 from fjosw/bugfix/gpu_sum_shm
Error Handling sum_Dgpu large objects
2021-10-14 09:12:47 -04:00
1f9688417a Error message added when attempting to sum object which is too large for
the shared memory
2021-10-13 20:45:46 +01:00
Christopher Kelly
6a3aaa52ef Test_dwf_lanczos can now run either G-parity Mobius or non-Gparity DWF according to cmdline switch
Fixed copyStream intialization
2021-10-12 12:59:54 -07:00
Peter Boyle
16c2a99965 Overlap cudamemcpy - didn't set up stream right 2021-10-11 13:31:26 -07:00
Peter Boyle
cda915a345 Better options 2021-10-07 20:29:09 +01:00
Peter Boyle
7c16189e16 Merge pull request #368 from Heinrich-BR/develop
Accelerated Pick-Set Checkerboard functions
2021-10-07 15:13:09 -04:00
Peter Boyle
ecbfccea43 Merge pull request #369 from paboyle/gauge-group-covariance
expose gauge group in GImpl and generic Nc fix
2021-10-07 15:11:12 -04:00
Peter Boyle
a8eda8f6da Summit scripts 2021-10-05 21:22:10 -04:00
Peter Boyle
9b1a0653cf Summit results 2021-10-05 21:22:01 -04:00
Peter Boyle
7cb1ff7395 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-05 20:13:42 -04:00
Peter Boyle
ab6ea29913 Print removal 2021-10-05 20:13:25 -04:00
b5c81a02b6 Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-10-05 21:13:01 +01:00
d899ee80fc skip record fixed to include norm metadata 2021-10-05 21:12:47 +01:00
Peter Boyle
4016e705fc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-10-05 14:56:57 -04:00
Peter Boyle
2f4e85e5d6 Summit set up 2021-10-05 14:56:17 -04:00
Peter Boyle
8ed0b57b09 Memory verbose and tracking, shrink default cache
Print PCI device IDs on node 0
2021-10-05 11:41:03 -04:00
a976fa6746 expose gauge group in GImpl and generic Nc fix 2021-10-05 14:19:47 +01:00
6c66b8d997 deflated guesser can optionally be used with less vectors than provided 2021-09-30 19:25:12 +01:00
Christopher Kelly
9ba47b4696 Merge branch 'develop' into gparity_HMC 2021-09-29 20:07:55 -07:00
Christopher Kelly
e85af80c39 Added return value checks on all cuda api calls
Test_dwf_lanczos can now run with either regular DWF or Mobius+Gparity based on cmdline arg
2021-09-29 19:57:43 -07:00
9523ad3d73 vector version of Schur solver use vector guesser 2021-09-28 12:45:47 +01:00
73a95fa96f LinearFunction loops over vectors by default, can be overloaded 2021-09-28 12:44:26 +01:00
Christopher Kelly
0b91e90dd4 Merge branch 'develop' into feature/gparity_HMC 2021-09-27 07:16:26 -07:00
7e130076d6 Fixed line left behind 2021-09-24 17:26:31 +01:00
6efdad6f21 Removed Halo benchmark 2021-09-24 17:18:04 +01:00
a822c48565 Added accelerated pick-set checkerboard functions 2021-09-24 17:13:25 +01:00
014fb76e88 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-09-24 16:45:25 +01:00
30e5311b43 Update from the gods upstream 2021-09-24 16:39:56 +01:00
Peter Boyle
67e08aa952 New file not run yet 2021-09-23 23:39:55 +02:00
Peter Boyle
ed1f20f3a1 Merge pull request #367 from mmphys/bugfix/H5NS
Hdf5 namespace
2021-09-23 12:36:11 -04:00
Peter Boyle
cffc736bb3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-22 06:03:06 -07:00
Peter Boyle
c0d56a1c04 Perlmutter tune up 2021-09-22 06:02:34 -07:00
Peter Boyle
3206f69478 SYCL happy 2021-09-21 18:01:35 -07:00
Peter Boyle
b2ccaad761 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 12:18:05 -07:00
Peter Boyle
8eb1232683 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 09:25:07 -07:00
Peter Boyle
c6ce3ad03b Some properties 2021-09-21 09:20:21 -07:00
Peter Boyle
b3b033d343 Clean 2021-09-21 09:18:54 -07:00
Peter Boyle
ca9816bfbb Typo 2021-09-21 04:12:04 +02:00
Peter Boyle
814d5abc7e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-21 04:05:51 +02:00
Peter Boyle
a29122e2bf Rebench 2021-09-21 04:05:04 +02:00
Peter Boyle
e188c0512e Udpdate 2021-09-21 01:04:30 +02:00
Peter Boyle
1fb6aaf150 Device 2 Device with cudaMemcpy 2021-09-21 01:03:07 +02:00
Peter Boyle
894654f7ef Simplificatoin, always gather faces 2021-09-21 01:02:34 +02:00
Peter Boyle
109507888b Option to force use of MPI over Nvlink 2021-09-21 00:53:25 +02:00
Peter Boyle
68650b61fe Options controlling behaviour 2021-09-21 00:51:01 +02:00
Michael Marshall
7ee66bf453 Make sure H5NS has empty definition if HDF5 built without C++ namespace. Add comment in Hdf5IO.cc indicating likely source of error using H5NS, i.e. lack of --enable-cxx in hdf5 configure. 2021-09-19 19:45:20 +01:00
Peter Boyle
8bd70ad8b5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-16 10:22:38 -07:00
Peter Boyle
af98525766 Merge pull request #359 from paboyle/feature/serialisation-update
Feature/serialisation update
2021-09-16 10:24:52 -04:00
Peter Boyle
1c2f218519 Merge pull request #360 from pjgeorg/ld-nvcc-openmp
nvcc: Add -fopenmp to LDFLAGS
2021-09-16 10:24:30 -04:00
Peter Boyle
c9aa1f507c Merge pull request #363 from felixerben/feature/testMesonField
Feature/test meson field
2021-09-16 10:23:58 -04:00
Peter Boyle
ea7126496d Merge pull request #361 from edbennett/fix-setdevice-message
make message about setdevice consistent with configure script
2021-09-16 10:23:37 -04:00
Peter Boyle
f660dc67e4 Merge pull request #366 from lehner/feature/gpt
Avx512 mixed prec
2021-09-15 20:27:13 -04:00
Christoph Lehner
ede8faea74 Merge branch 'paboyle:develop' into feature/gpt 2021-09-16 02:23:15 +02:00
Christoph Lehner
1b750761c2 Merge pull request #26 from waterret/feature/gpt
AVX512 drop mixed precision as well
2021-09-16 02:22:52 +02:00
Peter Boyle
145acf2919 Perf results 2021-09-16 01:06:28 +01:00
Peter Boyle
cc4a27b9e6 Scripts and performance 2021-09-16 00:15:35 +01:00
Peter Boyle
b4690e6091 Adding build basics for different systems 2021-09-16 00:00:38 +01:00
Luchang Jin
4b24800132 AVX512 drop mixed precision as well 2021-09-15 16:29:47 -04:00
Peter Boyle
9d2238148c Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-09-15 19:25:57 +01:00
Peter Boyle
c15493218d Two extra routines to break out SchurRedBlack on many RHS into stages to allow efficient deflation & split grid
Split grid solver still to do.
2021-09-15 19:24:39 +01:00
Peter Boyle
001a556a34 Merge pull request #365 from lehner/feature/gpt
Sync
2021-09-15 13:34:02 -04:00
Christoph Lehner
3d0f88e702 A64FX drop mixed precision as well 2021-09-15 18:38:32 +02:00
Christoph Lehner
dd091d0960 consistent pointer offloading instead of views 2021-09-15 16:58:05 +02:00
Christoph Lehner
e2abbf9520 Merge pull request #25 from paboyle/develop
Sync
2021-09-15 10:02:43 +02:00
Peter Boyle
c7baeb5bae Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-09-14 08:31:11 -07:00
Peter Boyle
402d80e197 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-09-14 16:16:06 +01:00
Peter Boyle
86e33c8ab2 Significant GPU perf speed up finished 2021-09-14 16:14:23 +01:00
Peter Boyle
5dae6a6dac Deprecate half prec comms 2021-09-14 15:06:59 +01:00
Peter Boyle
361bb8a101 Remove half prec comms 2021-09-14 15:06:29 +01:00
Peter Boyle
7efdb3cd2b Remove half prec comms 2021-09-14 15:06:06 +01:00
Peter Boyle
65ef4ec29f Move tables to device memory 2021-09-14 15:05:01 +01:00
Peter Boyle
d5835c0222 Switch to coalesced stencil face gather 2021-09-14 15:04:14 +01:00
Peter Boyle
a7b943b33e Remove half prec comms 2021-09-14 05:05:33 +01:00
Peter Boyle
7440cde92f No half prec comms; coalesced access on GPU 2021-09-14 05:04:56 +01:00
Peter Boyle
0fc662bb24 Dirac cuda 11.4 happy ; force host for functions accessing mult table
ET runs these on host BEFORE lodging result in AST for kernel
2021-09-14 05:00:44 +01:00
Peter Boyle
8195890640 Force MPI over NVLINK 2021-09-14 05:00:17 +01:00
Peter Boyle
4c88104a73 Fix compile warns 2021-09-11 23:08:05 +01:00
Peter Boyle
73b944c152 Drop half prec comms for now. 2021-09-11 23:07:18 +01:00
Peter Boyle
d1b0b7f5c6 Half prec comms dropping 2021-09-11 23:05:40 +01:00
Peter Boyle
381d8797d0 Drop half prec comms for now 2021-09-11 23:05:02 +01:00
Christopher Kelly
d184b8c921 Merge branch 'develop' into gparity_HMC 2021-09-08 06:14:08 -07:00
Christopher Kelly
c92e390b08 Added initial main binary code for 40ID and 48ID Gparity HMC 2021-09-08 09:00:13 -04:00
11ee8a1061 Merge remote-tracking branch 'upstream/develop' into develop 2021-09-02 16:57:42 +01:00
Peter Boyle
b06526bc1e Comment update 2021-08-30 21:15:39 -04:00
Peter Boyle
3044419111 Some sample code 2021-08-30 20:32:11 -04:00
Peter Boyle
bcfa9cf068 Improvement of output 2021-08-28 08:08:15 -07:00
Peter Boyle
114920b8de Some example clean up 2021-08-25 12:24:17 +01:00
Peter Boyle
0d588b95f4 Bug fix to Example_Laplacian test 2021-08-23 23:14:26 +01:00
Peter Boyle
5b3c530aa7 Return value 2021-08-23 15:30:45 +01:00
Peter Boyle
c6a5499c8b Fail on non-apple 2021-08-22 18:40:55 +01:00
Peter Boyle
ec9c3fe77a Remove the file 2021-08-22 18:28:39 +01:00
Peter Boyle
6135ad530e Extra examples / solutions 2021-08-22 18:25:07 +01:00
Peter Boyle
40098424c7 Examples 2021-08-22 14:17:12 +01:00
Peter Boyle
7163b31a26 Examples 2021-08-20 01:15:23 +01:00
Peter Boyle
ffbdd91e0e Apple happiness 2021-08-20 01:15:00 +01:00
Peter Boyle
5d29e175d8 Typo fix 2021-08-10 18:25:43 +01:00
Peter Boyle
417dbfa257 Fix 2021-08-10 08:55:35 -07:00
peterx.a.boyle
1eda4d8e0b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-08-10 05:41:18 -07:00
peterx.a.boyle
50181f16e5 Level 0 IPC set up 2021-08-10 05:35:15 -07:00
Peter Boyle
75030637cc Improved comms benchmark, same as benchmark_comms_host_device 2021-08-10 05:16:30 -07:00
Peter Boyle
fe5aaf7677 Make comms benchmark same as Benchmark_comms_host_device 2021-08-09 04:06:30 -07:00
Peter Boyle
80ac2a73ca Check is wrong (HtoD / DtoH) 2021-08-05 18:33:20 -04:00
Andrew Yong
770680669d Whitespace removal. 2021-08-04 09:21:59 +01:00
Andrew Yong
0cdfc5cf22 Merge remote-tracking branch 'upstream/develop' into develop 2021-07-30 14:40:55 +01:00
Christopher Kelly
5b36a8af54 Added a CshiftLink function to the GaugeImplementations and boundary condition classes that offers a boundary aware C-shift
Modified gauge fixing code to use CshiftLink internally such that the steepest descent algorithm is universal
Modified gauge transformation code to use CshiftLink for a universal definition
Improved comprehensibility of Test_fft_gfix and generalized to use either periodic or charge conjugation BCs based on cmdline option
Added cmdline options to Test_fft_gfix to tune alpha and optionally disable the Fourier acceleration tests
2021-07-12 17:13:40 -04:00
d75a66a3e6 test done 2021-07-06 11:42:36 +01:00
fcc4374d7b i/o done 2021-07-05 14:52:00 +01:00
67c3c16fe5 working test 2021-07-05 14:41:52 +01:00
25e9be50b5 created test file 2021-07-02 15:51:19 +01:00
Christopher Kelly
75a1f85162 Added method to compute and return the Wilson flow energy density over some number of steps 2021-06-30 17:24:00 -04:00
428b8ba907 Updated from upstream and added halo benchmark 2021-06-29 01:05:12 +01:00
323cf6c038 make message consistent with configure script 2021-06-23 17:00:43 +01:00
Peter Boyle
29a22ae603 Simpler SYCL setup 2021-06-22 17:57:20 +00:00
Peter Boyle
403bff1a47 Force reqd subgroup size fo SYCL 2021-06-22 17:56:10 +00:00
Christoph Lehner
c50f27e68b Make FFT play nice with split grid 2021-06-20 11:34:38 +02:00
Peter Georg
80afacec5b nvcc: Add -fopenmp to LDFLAGS 2021-06-17 13:05:13 +02:00
Peter Boyle
6cd9224dd7 SYCL comms buffer allocate 2021-06-16 17:10:55 +00:00
Peter Boyle
4bf8196ff1 Merge branch 'develop' of https://www.github.com/paboyle/Grid into develop 2021-06-15 21:45:36 +00:00
Peter Boyle
4c5440fb06 const happy for sycl 2021-06-15 21:45:07 +00:00
Christopher Kelly
ac4f2d9798 Fixed EOFA approx test square rooting the result inappropriately thus failing when it shouldn't
To MDWF+ID GPBC evol main program, added routine to compute the lower bound of the EOFA using the power method with a command line toggle
2021-06-09 09:08:37 -04:00
a269a3d919 Merge pull request #358 from mmphys/feature/serialisation-test
Add a ragged std::vector to the serialisation test
2021-06-09 10:16:25 +01:00
Michael Marshall
0c4f585496 Test nested std::vector<grid tensor> 2021-06-08 00:05:35 +01:00
Michael Marshall
33d2df46a0 Merge branch 'develop' into feature/serialisation-test
* develop:
  Update README.md
  removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore)
2021-06-07 23:25:38 +01:00
Michael Marshall
2df308f649 Add a ragged vector to the serialisation tests. NB: Already had nested (regular) std::vector<std::vector<...>> 2021-06-07 23:25:07 +01:00
Peter Boyle
92def28bd3 Update README.md 2021-06-06 04:52:05 -04:00
ca10bfa1c7 removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore) 2021-06-04 11:12:22 +01:00
298a6ec51e Merge pull request #357 from mmphys/bugfix/ragged
Bugfix/ragged Multi-dimensional ragged vectors
2021-06-04 10:34:46 +01:00
Christopher Kelly
c3b99de33f In EOFA pseudofermion action, implemented M^{-1} (this costs the same as M for EOFA!)
Added tests/solver/Test_eofa_inv.cc to test the above
In MDWF+ID GPBC binary, tests of RHMC approx for the action / MD approxs can be performed separately using a cmdline toggle
2021-06-03 11:11:14 -04:00
Michael Marshall
e5dbe488a6 Merge branch 'develop' into bugfix/ragged
* develop:
  Remove synch
2021-06-03 08:25:56 +01:00
Peter Boyle
0e27e3847d Remove synch 2021-06-03 04:24:19 +00:00
Christopher Kelly
e1a02bb80a Added main program to reproduce 32ID ensemble with 240MeV pions and GPBC
Allowed EOFA to accept different solvers for the L and R operations in the heatbath step
Fixed EOFA Meofa operating on member Phi rather than input field
Added derived EOFA pseudofermion variant that allows for mixed prec CG to be used in the heatbath
Added forces/Test_mobius_gparity_eofa_mixed testing the above reproduces the regular EOFA
To Test_gamma, added checks for the various properties of the charge conjugation matrix C=-gamma2*gamma4 in Grid basis
2021-06-01 11:44:34 -04:00
Michael Marshall
393727b93b Documentation update (briefly) covering serialisation changes. For review 2021-06-01 15:49:37 +01:00
Michael Marshall
2b1fcd78c3 Fixes post review with Peter: a) Correct bug in isRegularShape - detect 3d matrix where 1st slice is 2x2 and second slice is 2x1; b) Synchronisation of EigenResizeCounter done by checking we're the OMP primary thread; c) Move definition of EigenResizeCounter to new file, BaseIO.cc 2021-05-31 22:24:54 +01:00
Michael Marshall
0a4e0b49a0 BaseIO: Added "EigenResizeCounter" to keep track of any allocations/deallocations to Eigen tensors during readback. On read, if the tensor is resized, EigenResizeCounter += delta memory (in bytes) 2021-05-31 12:49:56 +01:00
Michael Marshall
76af169f05 Add global namespace to Writer<T> and Reader<T> inside GRID_SERIALIZABLE_CLASS_MEMBERS (so that "using Grid" not necessary).
Fix issue with output of Grid::iMatrix so that M<3>{{148,149,150,} {151,152,153,} {154155156}} becomes M<3>{{148,149,150} {151,152,153} {154,155,156}}
2021-05-31 08:43:02 +01:00
Michael Marshall
7b89232251 Extended HDF5 serialisation of std::vector<T> where T now also includes Grid scalar/vector/matrix
Changed VectorUtils element traits to is_flattenable, because: a) contract changed on what it does; and b) no other Grid dependencies on element. Needs review.
Initial tests work ... needs proper regression testing.
2021-05-30 20:27:53 +01:00
Peter Boyle
b5aeae526f Make Cshift fields static to avoid repeated reallocaate overhead 2021-05-28 16:33:08 +02:00
Michael Marshall
ef0ddd5d04 std::vector serialisation in hdf5 uses a different format if the vector is ragged. When reading back std::vector we need to check which format we're reading (since we don't know a priori) and this involves looking for attributes that may not exist. The c++ API: a) throws; and b) prints voluminous logging. Switched to non-throwing, non-logging, C version of the API after code review. 2021-05-24 18:43:55 +01:00
Michael Marshall
9b73dacf50 First row might still be ragged if multi dimensional. attrExists() doesn't throw, but easier to wrap in try ... catch than to explain in comment. 2021-05-22 04:34:32 +01:00
Michael Marshall
244b4aa07f Serialise std::vector of numeric types as multidimensional object if size is regular ... or individually if ragged 2021-05-21 20:08:56 +01:00
Christopher Kelly
86f08c6b9a Added a check that the initial EOFA action agrees with |eta|^2, thus checking the quality of the rational approximation in the heatbath 2021-05-18 13:57:44 -04:00
Christopher Kelly
9f0271039f Completed implementation of Meofa method of ExactOneFlavourRatio pseudofermion action
Added tests to tests/forces/Test_mobius_force_eofa.cc testing that the EOFA heatbath results in Phi = M^{-1/2} eta
2021-05-18 12:27:51 -04:00
Christopher Kelly
24df770f74 Added tests/IO/Test_field_array_io.cc testing/demonstrating parallel IO of an array of 5D fermion fields 2021-05-13 12:32:45 -04:00
Christopher Kelly
45b6c7effc Added a test code forces/Test_gpdwf_force_1f_2f that compares the action and force for DWF, EOFA and DSDR actions between the 1f and 2f implementations of G-parity BCs
Broke up ExactOneFlavourRatio refresh into a virtual routine that generates eta and one that uses it as with the ratio and RHMC actions
Added accessors to the pseudofermion field to TwoFlavourEvenOddRatio and ExactOneFlavourRatio
2021-05-12 16:34:07 -04:00
Quadro
1c70d8c4d9 Warning remove 2021-05-05 19:56:04 -04:00
Quadro
f0e9a5299f Happy on GCC I hope 2021-05-05 19:55:34 -04:00
Quadro
f1b8ba45e7 Warning on GCC suppress unrelated to my code so why doesn't it shut up about its ABI fix 2021-05-05 19:54:21 -04:00
Peter Boyle
fe998ab578 Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC 2021-05-05 17:36:51 -04:00
Peter Boyle
c2ee2b5fd1 Random chhanges 2021-05-05 17:36:38 -04:00
Peter Boyle
3b734ee397 two point function example 2021-05-05 17:36:19 -04:00
Peter Boyle
8637a9512a Freeze Gaussian implementation 2021-05-05 17:34:54 -04:00
Peter Boyle
7f6e2ee03e Drop normal_distribution, standardise 2021-05-05 17:34:17 -04:00
u61464
8cfc7342cd staggered hand unroll read coalesce 2021-05-05 14:17:18 -07:00
Peter Boyle
7b02acb2bd Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC 2021-05-04 13:45:11 -04:00
Peter Boyle
86948c6ea0 CRC for finger print fields - aids debug / version diff 2021-05-04 13:44:38 -04:00
Peter Boyle
53d226924a CRC added 2021-05-04 13:44:07 -04:00
Christopher Kelly
80176b1b39 RHMC now outputs some initial norms to the logs
Fixed DWF+I Gparity binaries not correctly assigning twist directions (thanks Peter!)
2021-05-04 13:12:23 -04:00
u61464
15ae317858 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-05-04 08:40:38 -07:00
u61464
834f536b5f Fastest option on SyCL is now std::complex 2021-05-04 08:40:18 -07:00
Christopher Kelly
29ddafd0fc Added variant of G-parity DWF+I ensemble gen code using double prec RHMC 2021-04-30 13:12:24 -04:00
Peter Boyle
c332d9f08b Merge pull request #356 from felixerben/bugfix/stoutSmearing
Jamie's fix
2021-04-27 14:10:49 -04:00
cf2923d5dd Jamie's fix 2021-04-27 16:53:37 +01:00
Peter Boyle
0e4413ddde Merge pull request #355 from felixerben/bugfix/stoutSmearing
bugfix 3D stout smearing
2021-04-27 08:01:55 -04:00
Peter Boyle
0f08364e4f Mom filter refresh sRNG 2021-04-26 23:18:11 +02:00
Peter Boyle
a198d59381 Merge branch 'feature/gparity_HMC' of https://github.com/paboyle/Grid into feature/gparity_HMC 2021-04-26 21:05:52 +02:00
009ccd581e bugfix 3D stout smearing 2021-04-26 10:36:33 +01:00
Peter Boyle
8cd4263974 Tests compile 2021-04-25 22:20:37 -04:00
Peter Boyle
d45c868656 Change interface 2021-04-25 10:53:34 -04:00
Peter Boyle
955a8113de Expose label only to reduce number of parameters 2021-04-25 10:36:38 -04:00
Peter Boyle
dbe210dd53 Open the ens_id 2021-04-25 10:25:59 -04:00
Peter Boyle
3a4f5f2324 Merge develop, strengthen force tests 2021-04-22 18:54:00 -04:00
Peter Boyle
824d84473f Merge branch 'develop' into feature/gparity_HMC 2021-04-22 16:32:41 -04:00
Peter Boyle
38964a4076 Switch twist direction 2021-04-22 15:57:37 -04:00
Peter Boyle
0d9aa87228 Reduce momentum to the GP plane 2021-04-22 15:56:59 -04:00
Peter Boyle
0e959d9b94 Update plaquette analysis 2021-04-22 15:55:47 -04:00
Peter Boyle
752f70cd48 Merge branch 'develop' into feature/gparity_HMC 2021-04-22 01:58:11 +02:00
54c6b1376d Quick fix of conserved current implementation in CayleyFermion5D. Now function treats current insertion with appropriate periodic boundary conditions in the mu=3 direction. 2021-04-21 16:56:46 +01:00
Peter Boyle
86e11743ca set twists 2021-04-20 10:19:11 -04:00
f3f11b586f Tadpole sign now in front of forward hopping term to be consistent with previous implementation and analytic form. 2021-04-17 12:44:27 +01:00
8083e3f7e8 Sign factor for tadpole implementation corrected. 2021-04-15 11:14:31 +01:00
Christopher Kelly
e0e42873c1 Const correctness for Lattice::Replicate
Adapted GeneralEvenOddRationalRatio and Test_rhmc_EOWilsonRatio_doubleVsMixedPrec to recent changes that require passing in serial RNG

For GeneralEvenOddRationalRatio and TwoFlavourEvenOddRatio, broke refresh into two stages, the first of which generates the random field and the second that computes the pseudofermion field.
This allows derived classes to override the generation of the random field, for example in testing.

Test_dwf_gpforce now uses Gparity in x-direction and APBC in time as opposed to G-parity in time

Added Test_action_dwf_gparity2fvs1f that compares the DWF fermion action with the 2f and the 1f (doubled-lattice) implementations of Gparity
2021-04-14 16:41:27 -04:00
Peter Boyle
980e721f6e Update MetaData.h 2021-04-13 09:33:01 -04:00
364793154b Reverted checkerboard changes 2021-04-09 15:47:17 +01:00
3e2ae1e9af Added profiling messages to pick and set checkerboard functions 2021-04-08 16:58:47 +01:00
Henrique Rocha
d38ae2fd18 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-04-06 17:18:39 +01:00
Henrique Rocha
030e7754e4 Merge remote-tracking branch 'upstream/develop' into develop 2021-04-06 17:16:13 +01:00
Peter Boyle
e2a0142d87 Merge pull request #348 from AndrewYongZhenNing/develop
Conserved Tadpole Implementation for Shamir Action Only
2021-04-06 10:49:00 -04:00
895244ecc3 Merge with upstream; implemented conserved tadpole for Shamir action. 2021-04-06 13:46:33 +01:00
addeb621a7 Implemented tadpole operator for Shamir action. 2021-04-06 13:45:37 +01:00
3b7fce1e76 Reverted checkerboard changes 2021-04-02 14:38:41 +01:00
4d15417f93 Merge remote-tracking branch 'upstream/develop' into develop 2021-04-01 18:28:15 +01:00
ab3c855f65 Merge branch 'develop' of https://github.com/Heinrich-BR/Grid into develop 2021-04-01 18:22:05 +01:00
92e2c517d8 Changed pick- and setCheckerboard to use accelerator_for 2021-04-01 18:21:19 +01:00
Peter Boyle
a7fb25adf6 Make Cshift fields static to avoid repeated reallocaate overhead 2021-03-29 21:44:14 +02:00
Peter Boyle
e947992957 Improved force terms 2021-03-29 20:04:06 +02:00
Peter Boyle
bb89a82a07 Staggered coalseced read 2021-03-29 20:01:15 +02:00
Christopher Kelly
0ff3bf6dc5 Merge branch 'develop' into feature/gparity_HMC 2021-03-22 15:33:13 -04:00
Christopher Kelly
351eab02ae Comment fix 2021-03-22 14:39:17 -04:00
Christoph Lehner
2bb374daea hip-friendly 2021-03-19 11:33:23 +01:00
Peter Boyle
8bdadbadac Cold start 2021-03-18 15:41:14 -04:00
Peter Boyle
15c50a7442 Explicit instantiate the template function 2021-03-18 15:40:42 -04:00
Peter Boyle
49b0af2c95 Update of tests to compile with the sRNG addition.
Audited the code conventions (again) with the CPS momentum denominator
and added anti periodic in time to the Test_mobius_force.cc and
tested the Test_dwf_gpforce.

Promoted thesee to test full HMC hamiltonian, tr P^2/2 + phidag MdagM phi

with the same pdot and Udot as audited in the Integrator.h etc...

With full comments and sources for factors.
2021-03-18 09:10:02 -04:00
Peter Boyle
9c2b37218a sRNG parameter added 2021-03-18 06:24:11 -04:00
Peter Boyle
3c67d626ba Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-12 15:36:55 +01:00
Peter Boyle
51f506553c Read out the local ID once, and store 2021-03-12 15:33:04 +01:00
Peter Boyle
226be84937 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-12 09:31:50 -05:00
Peter Boyle
001814b442 updated to do list. Start adding DDHMC work items 2021-03-12 09:31:17 -05:00
Peter Boyle
db3ac67506 Update thread issue 2021-03-12 14:55:07 +01:00
Peter Boyle
da91a884ef NVCC versions found buggy added as guard 2021-03-11 23:54:53 +01:00
Peter Boyle
a71e6755e3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-11 22:43:06 +01:00
Peter Boyle
cd5891eecd Test that fails on Cuda 11.0 2021-03-11 22:34:28 +01:00
Peter Boyle
5bb7336f27 Merge pull request #347 from pjgeorg/fix-autotools-avx512
Fix inconsistent SIMD option AVX512

Thanks
2021-03-11 16:29:07 -05:00
Peter Boyle
ce1fc1f48a Possible fallback plan for Fionn's compiler bbug in nvcc 2021-03-11 22:20:53 +01:00
Peter Georg
82402c6a7c Add simd option SKL for ICC 2021-03-11 13:08:40 +01:00
Peter Georg
d9c4afe5b7 Fix inconsistent configure option AVX512
Before this change AVX512 enabled different instruction sets depending
on the compiler:

For Intel C++ Compiler Classic (ICC):
    AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL
    i.e. Intel Xeon Skylake and newer

For Intel ICX, gcc, clang:
    AVX512F, AVX512CD, AVX512ER, AVX512PF
    i.e. Intel Xeon Phi x200/x205 (KNL/KNM)

With this commit AVX512 now only enables the common instruction sets
supported by all CPUs supporting any AVX-512 instructions set:
AVX512F and AVX512CD (called COMMON-AVX512 by icc)
2021-03-11 12:58:49 +01:00
Peter Boyle
f786ff8d69 Extend test from Fionn, fails on A100 apparently 2021-03-10 14:32:06 -05:00
u61464
a651caed5f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-10 06:23:51 -08:00
u61464
0e21adb3f6 Gives 200GF/s on SyCL/DG1 8^4, doesn't uglify develop for other platforms too badly.
Easy to revert to clean more C++ stylistic code. Theres a SYCL_HACK macro I will clean up later once dpcpp
evolves a central nervous systems.
2021-03-10 05:40:51 -08:00
Peter Boyle
58bf9b9e6d Clean up test 2021-03-10 02:45:22 +01:00
Peter Boyle
2146eebb65 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-09 04:31:46 +01:00
Peter Boyle
6a429ee6d3 2d loop hits Nvidia 16bit limit on large local vols 2021-03-09 04:31:10 +01:00
Peter Boyle
4d1ea15c79 More verbosity. The 16bit limit on Grid.y, Grid.z is annoying 2021-03-09 04:29:37 +01:00
Peter Boyle
a76cb005e0 Update Tensor_exp.h 2021-03-08 13:37:57 -05:00
Christoph Lehner
49ecbc81d4 Merge pull request #24 from ThomasWurm/feature/gpt
Put GlobalSum outside the slice loop in sliceSum
2021-03-08 16:01:47 +01:00
Thomas Wurm
9e5fb52eb9 Put GlobalSum outside the slice loop 2021-03-08 13:53:34 +01:00
Peter Boyle
a9604367c1 Merge pull request #336 from lehner/feature/gpt
Make ShmDims configurable; adjust GRID_MAX_SIMD to allow for 128 byte width on GPUs
2021-03-05 13:17:19 -05:00
Peter Boyle
d7065023cc Merge pull request #332 from mmphys/feature/mres_schur
Optional changes to Test_cayley_mres e.g. Schur solver
2021-03-05 12:47:07 -05:00
Peter Boyle
89d299ceec Merge pull request #333 from mmphys/bugfix/LatTransfer
Fix convertType for GPU in Lattice_transfer.h
2021-03-05 12:46:33 -05:00
Peter Boyle
e34eda66df Merge pull request #344 from felixerben/feature/XiToSigma
Feature/xi to sigma
2021-03-05 12:45:44 -05:00
Christoph Lehner
b24181aa4f Update Coordinate.h
Revert GRID_MAX_SIMD change
2021-03-05 16:56:58 +01:00
Peter Boyle
aa173e2998 Update README.md 2021-03-05 10:25:33 -05:00
7a19432e0b whitespace 2021-03-05 10:57:09 +00:00
9b15704290 tested and consitent 2021-03-05 10:42:32 +00:00
Michael Marshall
017f955b2d Merge branch 'develop' into feature/mres_schur
* develop:
  Pass serial RNG around
  Sycl happier
2021-03-04 20:42:02 +00:00
Michael Marshall
f252d69eef Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Pass serial RNG around
  Sycl happier
2021-03-04 20:41:30 +00:00
3b06e4655e Merge branch 'develop' into feature/XiToSigma 2021-03-04 20:06:16 +00:00
d4b4de8f42 changes 2021-03-04 20:01:24 +00:00
Peter Boyle
c90beee774 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-03-03 23:50:29 +01:00
Peter Boyle
1eea9d73b9 Pass serial RNG around 2021-03-03 23:50:01 +01:00
Christopher Kelly
feee5ccde2 Added Gparity flavour Pauli matrix algebra and associated tensor types mirroring strategy used for Gamma matrices
Added test program for the above
2021-03-03 15:39:41 -05:00
u61464
679d1d22f7 Sycl happier 2021-03-03 11:21:43 -08:00
Michael Marshall
b2b5e0b98c Merge branch 'develop' into feature/mres_schur
* develop:
  Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.
  Better SIMD usage/coalescence
2021-03-03 16:15:12 +00:00
Michael Marshall
03e54722c1 Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path.
2021-03-03 16:13:23 +00:00
Peter Boyle
442336bd96 Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case.
Other cases to do. This now includes comms code path.
2021-03-02 14:50:51 +01:00
Christoph Lehner
9c9566b9c9 Merge pull request #23 from paboyle/develop
Sync
2021-03-01 12:33:51 +01:00
Michael Marshall
1059a81a3c Merge branch 'develop' into bugfix/LatTransfer
* develop:
  Better SIMD usage/coalescence
2021-02-27 00:21:36 +00:00
Peter Boyle
2e61556389 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-02-26 17:52:20 +01:00
Peter Boyle
f9b1f240f6 Better SIMD usage/coalescence 2021-02-26 17:51:41 +01:00
Michael Marshall
69f41469dd Merge branch 'develop' into bugfix/LatTransfer
* develop: (26 commits)
  Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
  Correct misleading ac help string
  Enable performance counting in WilsonFermion like in others
  changed back A2AUtils warning
  changed if and accelerator_for - no runtime errors any more
  Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
  Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
  Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field
  MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error
  maxLocalNorm2()
  change back benchmark_ITT
  prettify
  Flop cout matches DiRAC-ITT-2020
  revert changes
  merge develop
  fixes
  weird bug in 2pt function...
  revert changes
  final version, tested on CPU and GPU
  bugfix
  ...
2021-02-25 09:19:17 +00:00
Michael Marshall
d620b303ff Merge branch 'develop' into feature/mres_schur
* develop: (26 commits)
  Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
  Correct misleading ac help string
  Enable performance counting in WilsonFermion like in others
  changed back A2AUtils warning
  changed if and accelerator_for - no runtime errors any more
  Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
  Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
  Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field
  MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error
  maxLocalNorm2()
  change back benchmark_ITT
  prettify
  Flop cout matches DiRAC-ITT-2020
  revert changes
  merge develop
  fixes
  weird bug in 2pt function...
  revert changes
  final version, tested on CPU and GPU
  bugfix
  ...
2021-02-24 18:07:27 +00:00
Peter Boyle
157fd1428d Merge pull request #342 from paboyle/feature/link-update-mask
Feature/link update mask
2021-02-24 11:29:52 -05:00
Christopher Kelly
c791cb2214 Merge branch 'develop' into feature/link-update-mask 2021-02-23 11:51:54 -05:00
Christopher Kelly
d5ab571a89 Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces
Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC
Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links
2021-02-23 11:49:56 -05:00
0ed800f6e4 merge develop 2021-02-23 14:54:46 +00:00
Peter Boyle
0a32183825 Merge pull request #335 from felixerben/gpu/baryons
Gpu/baryons
2021-02-23 09:30:16 -05:00
Peter Boyle
2cacfbde2a Merge pull request #341 from DanielRichtmann/fix/minor-things
Minor fixes
2021-02-22 09:28:50 -05:00
Daniel Richtmann
c073e62e0b Correct misleading ac help string 2021-02-22 15:25:44 +01:00
Daniel Richtmann
e3d019bc2f Enable performance counting in WilsonFermion like in others 2021-02-22 15:25:40 +01:00
7ae030f585 changed back A2AUtils warning 2021-02-18 13:24:50 +00:00
86b58d5aff changed if and accelerator_for - no runtime errors any more 2021-02-18 12:04:32 +00:00
Peter Boyle
26e8b9f4a5 Merge pull request #340 from mmphys/bugfix/config
Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu
2021-02-17 11:56:21 -05:00
Michael Marshall
35114c9e62 Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu 2021-02-17 13:24:15 +00:00
Christopher Kelly
e0f6a146d8 To DWF+I G-parity evolution code, added ability to specify number of MD steps in params and an optional usage mode that reads the config and checks the plaq/checksum agree then exits 2021-02-16 10:41:52 -05:00
Peter Boyle
dfd28a85c9 Merge pull request #339 from mmphys/bugfix/config
Optional rename PACKAGE_ to GRID_ in Grid/Config.h
2021-02-15 13:53:26 -05:00
Michael Marshall
a503332924 Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working.
Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons.
2021-02-14 21:27:54 +00:00
Christopher Kelly
daa095c519 Fixed an obscure but reproducible hang in the RHMC caused by the bounds check being activated by a random number that wasn't synchronized over the nodes
HMC now also reports the "L-infinity norm" of the impulse, aka the largest site norm
2021-02-09 12:55:46 -05:00
Christopher Kelly
c2676853ca Merge branch 'bugfix/maxnorm2' into feature/gparity_HMC 2021-02-08 12:17:33 -05:00
Peter Boyle
1ac13ec3a7 Merge pull request #338 from paboyle/bugfix/maxnorm2
Fixed compile issues with maxLocalNorm2 for non-scalar lattices
2021-02-08 12:08:11 -05:00
Christopher Kelly
55de69a569 Fixed compile issues with maxLocalNorm2 for non-scalar lattices
maxLocalNorm2 test now reuses the random field
2021-02-08 12:03:16 -05:00
Peter Boyle
eda9ab487b MADWF 5d source option for hadrons - look at Grid of source
Abort on GPU error
2021-02-08 10:47:22 -05:00
Christopher Kelly
6a824033f8 Merge branch 'develop' into feature/gparity_HMC 2021-02-08 09:31:49 -05:00
Christopher Kelly
cee6a37639 Added a logging tag for HMC
As the integrator logger is active by default the cmdline option to activate had no effect. Changed option to *de*activate on request ("NoIntegrator")
Cleaned up generating rational approxs in the general RHMC code
As the tolerance of the rational approx is not related to the CG tolerance, regenerating approxs for MD and MC if they differ only by the CG tolerance is not necessary; this has been fixed
In DWF+I Gparity evolution code, added cmdline options to check the rational approximations and compute the lowest/highest eigenvalues of M^dagM for RHMC tuning
In the above, changed the integrator layout to a much simpler one that completes much faster; may need additional tuning
2021-02-08 09:30:35 -05:00
Peter Boyle
cd99edcc5f maxLocalNorm2() 2021-02-04 18:25:49 -05:00
Christoph Lehner
4705aa541d Allow user to configure ShmDims via environment variables 2021-02-04 14:25:55 +01:00
Michael Marshall
3215d88a91 Simplify syntax with Grid::EnableIf post code review. Updated EnableIf so that ReturnType defaults to void in same way as std::enable_if see https://en.cppreference.com/w/cpp/types/enable_if 2021-02-03 15:17:03 +00:00
9b9a53f870 ... 2021-02-02 13:06:43 +00:00
Christoph Lehner
019ffe17d4 Allow for GPU vector width beyond 64 2021-02-02 11:32:23 +01:00
Christopher Kelly
6cc3ad110c Improved logging output for RHMC bounds checks
In GenericHMCRunner, exposed functionality for initializing gauge fields and RNG for external use
2021-01-29 12:35:00 -05:00
bc496dd844 change back benchmark_ITT 2021-01-28 14:29:56 +00:00
a673b6a54d prettify 2021-01-28 14:15:09 +00:00
1bf2e4d187 Merge branch 'develop' into gpu/baryons 2021-01-27 21:17:37 +00:00
Peter Boyle
96dd7a8fbd Flop cout matches DiRAC-ITT-2020 2021-01-27 21:14:52 +00:00
7905afa9f5 revert changes 2021-01-27 21:14:52 +00:00
712bb40650 merge develop 2021-01-27 21:14:52 +00:00
81d88d9f4d fixes 2021-01-27 21:09:51 +00:00
Christopher Kelly
e6c6f82c52 Gparity DWF+I HMC main program now has option to specify parameter file 2021-01-27 11:18:41 -05:00
Christopher Kelly
d10d0c4e7f Merge branch 'develop' into feature/gparity_HMC 2021-01-25 15:13:29 -05:00
Christopher Kelly
9c106d625a Added HMC main program designed to reproduce the 16^3x32x16 DWF+I ensembles with beta=2.13 and Gparity BCs 2021-01-25 15:07:44 -05:00
Christopher Kelly
6795bbca31 Generalized GeneralEvenOddRatioRationalPseudoFermionAction such that the multi-shift CG algorithm can be overridden by derived classes
Added a mixed-precision variant of GeneralEvenOddRatioRationalPseudoFermionAction and a verification test against double prec class
Fixed non-const reference used in passing RHMC approx to multishift classes
2021-01-25 14:22:31 -05:00
Michael Marshall
77063418da Fix issue for GPU by ensuring accelerator_inline version of convertType is available for Grid::complex<T>. This removes many warnings in Hadrons
Simplify the SFINAE syntax and correct convertType for iScalar
2021-01-25 15:09:36 +00:00
Michael Marshall
2983b6fdf6 Optional (superficial) changes to make comparison with Hadrons WardIdentity module easier: use Schur solver; example of Hadrons random gauge init; logging updates; only solve reverse propagator if provided 2021-01-23 12:41:48 +00:00
Peter Boyle
69f1f04f74 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2021-01-21 21:39:59 -05:00
Peter Boyle
11a5fd09d6 Hot config 2021-01-21 21:39:41 -05:00
Peter Boyle
ff1fa98808 Fix for GPU conserveed current 2021-01-21 21:38:23 -05:00
Christopher Kelly
d161c2dc35 Improved formating of timing output in mixed-prec multishift
In test of mixed-prec multishift, added comparison against full double precision multishift both for timing and to cross-check the results
2021-01-20 15:42:06 -05:00
Christopher Kelly
7a06826cf1 Added option to NerscIO to disable exit on failing plaquette check allowing for circumvention of factor of 2 error in CPS-generated G-parity config headers
Adapted mixed-prec multi-shift test to new way to pass gauge BC directions and added cmdline option to perform the G-parity plaquette comparison with the corrected plaquette when loading config
2021-01-20 13:31:50 -05:00
Christopher Kelly
c3712b8e06 Merge branch 'develop' into feature/gparity_HMC 2021-01-20 11:48:52 -05:00
Christopher Kelly
901ee77b84 Mixed precision multishift test can now be performed with/without G-parity using cmdline check and can load a pregenerated configuration 2021-01-20 11:45:44 -05:00
df16202865 weird bug in 2pt function... 2021-01-19 19:25:27 +00:00
3ff7c2c02a Merge branch 'develop' into gpu/baryons 2021-01-19 12:34:13 +00:00
fc6d07897f revert changes 2021-01-19 12:32:48 +00:00
f9c8e5c8ef Merge branch 'develop' of github.com:paboyle/Grid into develop 2021-01-19 12:30:29 +00:00
8bfa0e74f8 final version, tested on CPU and GPU 2021-01-19 12:27:57 +00:00
9b73a937e7 bugfix 2021-01-18 18:57:05 +00:00
Peter Boyle
b0339bc5a4 Merge branch 'feature/conjugate-bc-dirs' into develop 2021-01-15 09:28:39 -05:00
Peter Boyle
3c23a947cc Fixed test for very much non-unit det 2021-01-15 09:16:02 -05:00
Peter Boyle
56111bb823 Merge branch 'develop' into feature/conjugate-bc-dirs 2021-01-14 21:01:22 -05:00
Peter Boyle
99445673f6 Gparity fix, and plaquette IO 2021-01-14 21:00:36 -05:00
Peter Boyle
97a59643f7 Red black coarse space 2021-01-14 20:49:13 -05:00
Peter Boyle
579595f547 Red black on coarse space 2021-01-14 20:48:35 -05:00
Peter Boyle
281ac5fc12 Red black support on coars 2021-01-14 20:48:08 -05:00
Peter Boyle
d8fa903b02 G5 on coarse spaces 2021-01-14 20:47:28 -05:00
Peter Boyle
eaff0f3aeb Gamma5 on coaree spaces 2021-01-14 20:46:58 -05:00
Peter Boyle
e8e20c01b2 Coarsened vector test 2021-01-14 20:46:21 -05:00
Peter Boyle
a4afc3ea2a Red black coarse space 2021-01-14 20:44:16 -05:00
fa12b9a329 bugfix 2021-01-13 10:04:17 +00:00
45fc7ded3a test for sum 2021-01-12 09:10:37 +00:00
74de2d9742 whitespace changes 2021-01-08 18:28:36 +00:00
e759367d42 tested and working 2021-01-08 18:04:50 +00:00
Christopher Kelly
1b84f59273 Added a mixed precision multishift algorithm for which the matrix multiplies are performed in single precision but the search directions are accumulated in double precision.
A reliable update step is performed at a tunable frequency to correct the residual. A final mixed-prec single-shift solve is performed on each pole to perform cleanup if necessary.
A test is provided to demonstrate the algorithm.
2021-01-06 12:24:44 -05:00
Christopher Kelly
1fb41a4300 Added copyLane function to Tensor_extract_merge.h which copies one lane of data from an input tensor object to a different lane of an output tensor object of potentially different precision
precisionChange lattice function now uses copyLane to remove need for temporary scalar objects, reducing register footprint and significantly improving performance
2021-01-06 11:50:56 -05:00
Christopher Kelly
287bac946f ConjugateGradientMixedPrec now stores final true residual and uses the precisionChange workspaces for improved efficiency 2021-01-06 09:50:41 -05:00
Christopher Kelly
80c14be65e Added core test to check precision change 2021-01-06 09:34:44 -05:00
Christopher Kelly
d7a2a4852d Reimplemented precisionChange to run on GPUs. A workspace containing the mapping table can be optionally precomputed and reused for improved performance. 2021-01-06 09:30:49 -05:00
Christopher Kelly
d185f2eaa7 OneFlavourEvenOddRatioRationalPseudoFermionAction now derives from GeneralEvenOddRatioRationalPseudoFermionAction, simply performs transcription of parameters 2020-12-23 16:26:10 -05:00
Christopher Kelly
813d4cd900 Added test program that ensures the generic checkerboarded RHMC (with parameters set appropriately) gives the same answer as the existing 1f code 2020-12-23 16:01:42 -05:00
Christopher Kelly
75c6c6b173 General RHMC pseudofermion action now allows for different rational approximations to be used in the MD and action evaluation 2020-12-23 11:19:26 -05:00
Christoph Lehner
299d0de066 Merge pull request #21 from paboyle/develop
Sync
2020-12-22 20:59:15 +01:00
Christopher Kelly
220ad5e3ee Added more verbose log output to GeneralEvenOddRatioRationalPseudoFermionAction
In GeneralEvenOddRatioRationalPseudoFermionAction, setting the bounds check frequency to 0 now disables the check
2020-12-22 11:08:22 -05:00
Christopher Kelly
ba5dc670a5 Reimplemented GparityWilsonImpl::InsertForce5D to run efficiently on GPUs
Swapped order of templated tensor code and c-number specializations in Tensor_outer.h to fix compile issue with type deduction on Summit
2020-12-22 10:10:07 -05:00
Peter Boyle
3fe75bc7cb Merge pull request #329 from nmeyer-ur/feature/a64fx-3
Revised dslash/dwf kernels for A64FX
2020-12-20 08:17:15 -05:00
Nils Meyer
45d49d8648 clean up 2020-12-19 03:35:18 +01:00
Nils Meyer
6013183361 removed Asm impls 2020-12-19 03:25:01 +01:00
Nils Meyer
4b882e8056 fixed lost bracket 2020-12-19 03:09:20 +01:00
Nils Meyer
3f9ae6e7e7 Merge branch 'develop' into feature/a64fx-3 2020-12-19 02:37:11 +01:00
Nils Meyer
909acd55cd vnum variant for prefetches 2020-12-19 02:00:22 +01:00
Nils Meyer
4dd9e39e0d up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1 2020-12-19 00:54:31 +01:00
Christoph Lehner
b4c1317ab4 Merge pull request #22 from DanielRichtmann/feature/clover-access-specifier
Clover access specifier
2020-12-18 16:20:19 +01:00
Christopher Kelly
a0ca362690 Added an RHMC pseudofermion action, GeneralEvenOddRatioRationalPseudoFermionAction, that works for an arbitrary fractional power, not just a square root
Added a test evolution for the above, Test_rhmc_EOWilsonRatioPowQuarter, demonstrating conservation of Hamiltonian
Fixed HMC ignoring the MetropolisTest parameter of HMCparameters
2020-12-17 16:21:58 -05:00
Christopher Kelly
249b6e61ec For G-parity BCs the Nd-1 direction is now assumed to be the time direction and setting a twist in this direction will apply antiperiodic BCs
Added option to run Test_gparity with antiperiodic time BCs
2020-12-17 14:09:00 -05:00
f36d6f3923 compiles on GPU. 3pt still wrong!!!! 2020-12-17 17:04:08 +00:00
Peter Boyle
7adb253e25 Merge pull request #328 from mmphys/feature/mrespatch
Enable existing conserved current code for CUDA
2020-12-17 11:10:29 -05:00
808f1e0e8c merge develop 2020-12-15 16:33:29 +00:00
Michael Marshall
873519e960 Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration 2020-12-14 16:06:10 +00:00
Peter Boyle
9aec4a3c26 SYCL 2020-12-10 02:11:17 -08:00
Daniel Richtmann
c438118fd7 Change access specifier of clover fields in order to allow deriving classes to access these 2020-12-08 14:42:11 +01:00
Peter Boyle
70510d151b Merge pull request #327 from paboyle/feature/gparity_twist_GPU
Feature/gparity twist gpu
2020-12-07 12:02:20 -05:00
Christopher Kelly
9e7bacb5a4 Merge branch 'develop' into feature/gparity_twist_GPU 2020-12-07 11:55:39 -05:00
Christopher Kelly
2ef1fa66a8 Improved performance of G-parity kernel for GPUs by simplifying multLink implementation 2020-12-07 11:53:35 -05:00
Peter Boyle
cf76741ec6 Intel DPCPP Gold happy now (compiles all, runs Benchmark_dwf_fp32 ) 2020-12-03 03:47:11 -08:00
Peter Boyle
497e7c1c40 Duplicate code 2020-12-02 17:55:30 -08:00
Peter Boyle
888eacd3b8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-24 21:46:33 -05:00
Peter Boyle
321f0f51b5 Project to SU(N) 2020-11-24 21:46:10 -05:00
Christoph Lehner
17ec9c5545 Merge pull request #20 from paboyle/develop
Sync
2020-11-24 12:20:43 +01:00
Peter Boyle
30ad9578a2 Merge branch 'lehner-feature/gpt' into develop 2020-11-24 06:10:24 -05:00
Peter Boyle
9dce101586 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into lehner-feature/gpt 2020-11-24 06:10:16 -05:00
Peter Boyle
97e264d0ff Christoph's changes 2020-11-23 15:46:11 +00:00
Peter Boyle
683a5e5bf5 Stencil use host vector for integera table on enable-shared=no and mirror it on device 2020-11-23 15:39:51 +00:00
Peter Boyle
d4861a362c Stencil use non-UVM memory for look up table on enable-shared=no 2020-11-23 15:38:49 +00:00
Peter Boyle
5ff3eae027 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-20 13:14:44 -05:00
Peter Boyle
147dc15d26 Update 2020-11-20 13:13:59 -05:00
Christoph Lehner
c61ea72949 Merge pull request #19 from paboyle/develop
Sync
2020-11-20 17:31:13 +01:00
Peter Boyle
86e8b9fe38 ALLOC_ALIGN removed 2020-11-20 17:07:16 +01:00
Peter Boyle
612e468889 Configurable ALLOC_ALIGN and ALLOC_CACHE 2020-11-20 16:48:28 +01:00
Christoph Lehner
4ea8d128c2 Merge pull request #18 from paboyle/develop
Sync
2020-11-20 15:36:50 +01:00
Peter Boyle
e49b7f2f88 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-19 19:24:41 +01:00
Peter Boyle
aace3d47b9 partial work in progress 2020-11-19 19:24:14 +01:00
Peter Boyle
d5049949a4 Starting to fix reunitarise 2020-11-19 19:23:41 +01:00
Peter Boyle
f1c7480e3c Warning remove 2020-11-19 19:23:03 +01:00
Peter Boyle
5adae5d6ff Unused variable remove 2020-11-19 19:22:12 +01:00
Peter Boyle
a8412ace05 Merge pull request #317 from i-kanamori/develop
adding an error check for input: Parameters.StartingType
2020-11-18 23:09:40 -05:00
Peter Boyle
9fd1c2ad4b Merge pull request #325 from DanielRichtmann/feature/threaded-clover-inversion
Threaded clover term inversion
2020-11-18 23:08:37 -05:00
Peter Boyle
4cf3575353 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-18 03:07:36 +00:00
Peter Boyle
804a810d68 Wildcard mismatch 2020-11-18 03:06:53 +00:00
Peter Boyle
8fcb392e24 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-17 04:51:31 -08:00
Peter Boyle
dd8d70eeff Build without LIME 2020-11-17 04:41:15 -08:00
Peter Boyle
aa8aba6543 --shm-force-mpi 2020-11-16 20:15:50 -05:00
Peter Boyle
13df14f96e Switch off SHM paths with --disable-shm 2020-11-16 18:07:15 -05:00
Peter Boyle
3aab983760 Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency) 2020-11-16 17:13:58 +01:00
Peter Boyle
9c4dcc5ea3 Merge branch 'master' into develop 2020-11-16 16:34:57 +01:00
Peter Boyle
a1063ddbb9 Update options and simplify 2020-11-13 04:11:03 +01:00
Peter Boyle
18ef8056ec Hide Shared Memory 2020-11-13 04:10:40 +01:00
Peter Boyle
1c673977fa Must ask for COMMMS_THREADS 2020-11-13 03:59:36 +01:00
Peter Boyle
e9bc748828 Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel 2020-11-13 03:58:34 +01:00
Peter Boyle
f48156529b Work on 2,2,2,8 ranks 2020-11-13 03:57:58 +01:00
Peter Boyle
d05ce01809 TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED 2020-11-13 03:52:19 +01:00
Peter Boyle
cf23eff60e Device to Device, Memset, cannot assume UVM == Communicable 2020-11-13 03:51:08 +01:00
Peter Boyle
6e313575be Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac 2020-11-13 03:50:16 +01:00
Peter Boyle
b13d1f7238 TOFU compat flag to help Isaaku 2020-11-13 03:49:44 +01:00
Peter Boyle
b5e7945dd9 Option for host or device Cshift implementation 2020-11-13 01:38:54 +01:00
Peter Boyle
7535566f54 Option for bounce through the SHM buffer 2020-11-12 22:54:27 +01:00
Peter Boyle
50b808ab33 Configure option between host and device 2020-11-12 22:28:12 +01:00
Peter Boyle
f16c2665f5 Host memory explict 2020-11-12 20:29:58 +01:00
Peter Boyle
41e28015ae Volume divisible guarantee 2020-11-07 13:32:16 +01:00
3594ce877b speedup in Sigma-to-nucleon 2020-11-03 20:04:30 +00:00
9bae6b889a speedup in Sigma-to-nucleon 2020-11-03 20:03:09 +00:00
4014dfd5b9 first tested version 2020-11-03 16:13:08 +00:00
67023c334b bugfix 2020-11-03 13:07:37 +00:00
a3de7026c8 bugfix 2020-11-03 12:51:50 +00:00
ee11678b1f added Xi-to-Sigma rare decays 2020-11-03 12:41:35 +00:00
Peter Boyle
a0ccbb3bd6 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-11-01 01:16:35 +00:00
Peter Boyle
5eeabaa2bb HIP fix 2020-11-01 01:16:01 +00:00
Peter Boyle
00d0d6d008 Hip Free managed 2020-10-31 18:14:31 -04:00
Peter Boyle
537a9f7030 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-31 18:13:30 -04:00
Peter Boyle
cc9c993f74 Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt 2020-10-31 18:12:47 -04:00
Peter Boyle
d10422ded8 Test project on group 2020-10-31 18:12:30 -04:00
Peter Boyle
f313565a3c HiP compile 2020-10-31 12:12:40 +00:00
Daniel Richtmann
b3881d2636 Thread inversion of clover term 2020-10-30 16:18:58 +01:00
61d5860b46 Merge pull request #318 from rrhodgson/feature/BaryonSpinMat
Added untraced baryon contraction code
2020-10-28 18:39:59 +00:00
52d17987dc BaryonUtils.h updated debug output 2020-10-23 11:41:08 +01:00
19d8bba97d BaryonUtils function naming change 2020-10-21 11:58:53 +01:00
463d72d322 Added untraced baryon contraction code 2020-10-19 16:13:28 +01:00
d060341168 add an error check for Parameters.StartingType 2020-10-16 21:39:17 +09:00
c772bcd514 Merge https://github.com/paboyle/Grid into develop 2020-10-16 20:30:32 +09:00
Peter Boyle
3362f8dfa0 happy compile 2020-10-14 22:59:41 -04:00
Peter Boyle
bf3c9857e0 Closure changes 2020-10-14 21:37:14 -04:00
Peter Boyle
a88b3ceca5 Closure cases 2020-10-14 21:33:51 -04:00
Peter Boyle
aa135412f5 toComplex, toReal 2020-10-13 22:25:01 -04:00
Peter Boyle
9945399e60 Reaality issues fix by drop from ET 2020-10-13 22:24:32 -04:00
Peter Boyle
5eeffa49e8 Reality forced included 2020-10-13 22:23:57 -04:00
Peter Boyle
3f06209720 Pretty print 2020-10-13 22:18:51 -04:00
Peter Boyle
12e239dd9f Merge branch 'release/dirac-ITT-2020' 2020-10-13 13:38:29 -04:00
Peter Boyle
af2301afbb Merge pull request #312 from i-kanamori/debug_512
add reordring of random number generators in IO
2020-10-13 11:42:12 -04:00
Peter Boyle
f98856a26f Merge pull request #314 from smangham/issue_readme_precision
Fix for deprecated configure options in documentation (issue #313)
2020-10-13 11:41:38 -04:00
Sam Mangham
d55cc5b380 Fixed typo on --enable-comm, removed all references to --enable-precision except for config options, where it is listed as deprecated. Removed travis test for single precision. 2020-10-12 12:33:13 +01:00
c2b688abc9 Benchmark_IO: reducing max local volume to 32^4 2020-10-10 16:52:56 +01:00
b0d61b9687 Benchmark_IO cleaner output 2020-10-09 21:46:45 +01:00
5f893bf9af Benchmark_IO procurement sizes 2020-10-09 21:31:59 +01:00
0e17bd6597 I/O benchmark cleanup 2020-10-09 20:29:57 +01:00
22caa158cc multi-pass I/O benchmark, with statistic and robustness summary 2020-10-09 20:29:40 +01:00
b24a504d7c hook to access last parallel I/O performance measurement 2020-10-09 20:28:54 +01:00
Peter Boyle
992ef6e9fc more runtime 2020-10-08 22:19:20 -04:00
Peter Boyle
f32a320bc3 Single prec benchmark in double prec compile 2020-10-08 19:52:08 -04:00
Peter Boyle
5f0fe029d2 Improve meemory benchmarks for GPU (avoid host mem ping pong) 2020-10-08 19:51:28 -04:00
6b1486e89b fixing number of colours defaulting to 4 in most cases 2020-10-08 16:31:24 +01:00
Peter Boyle
3f9c427a3a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-10-07 13:12:57 -04:00
Peter Boyle
d201277652 Expose Nc as a compile time configure option.
Remove precision option
2020-10-07 13:07:00 -04:00
fdda7cf9cf Merge branch 'feature/benchmark-io-update' into develop 2020-10-07 15:57:53 +01:00
e22d30f715 Merge branch 'develop' into feature/benchmark-io-update 2020-10-07 15:56:39 +01:00
1ba25a0d8c more I/O benchmark code cleaning 2020-10-07 15:38:41 +01:00
9ba3647bdf script to convert I/O benchmark logs to CSV 2020-10-07 15:35:03 +01:00
5ee832f738 I/O benchmark code cleaning 2020-10-07 15:31:51 +01:00
467deee46f Merge branch 'debug_512' into develop 2020-10-07 15:18:44 +09:00
Peter Boyle
35a69a5133 SU4 x SU4 2020-10-06 21:48:35 -04:00
e9c5a271a8 fixing potential issues with log alignment and timer I/O 2020-10-06 17:58:16 +01:00
acac2d6938 standard C/C++ I/O in benchmark 2020-10-06 17:57:00 +01:00
97db2b8d20 add reordring of random number generator in IO 2020-10-06 17:25:59 +09:00
Christoph Lehner
80fd6ab407 Merge pull request #17 from paboyle/develop
sync upstream
2020-10-06 09:01:39 +02:00
Christoph Lehner
5534921bee Merge pull request #16 from DanielRichtmann/feature/gpt-coarsenedmatrix
Enable checkerboard operations for CoarsenedMatrix
2020-10-01 10:55:13 +02:00
Peter Boyle
ace9cd64bb dpcpp happy 2020-09-29 08:03:46 -07:00
Peter Boyle
a3e2aeb603 dpcpp options happiness 2020-09-29 06:50:10 -07:00
Peter Boyle
049dd25785 Revert accidental commit thanks michael 2020-09-23 04:13:50 -04:00
Peter Boyle
d43d372294 Merge pull request #311 from mmphys/bugfix/MPIasynch
Asynchronous calls removed - reflect this in Communicator_none.cc
2020-09-22 10:41:48 -04:00
Michael Marshall
b71a081cba Asynchronous calls removed - reflect this in Communicator_none.cc
(Opportunistic doc update - OpenMP support on Mac OS)
2020-09-21 09:33:23 +01:00
Peter Boyle
c48909590b MPI asynch call removal 2020-09-17 20:47:32 +01:00
Peter Boyle
446ef40570 HIP IPC 2020-09-17 20:31:46 +01:00
Peter Boyle
81441e98f4 HIP runs sensible 2020-09-16 03:35:03 +01:00
Peter Boyle
ecd3f890f5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-16 02:30:14 +01:00
Peter Boyle
1c881ce23c HIP does not like half2 visible members x and y so must define own Half2 2020-09-16 02:28:33 +01:00
Peter Boyle
dacbbdd051 Hip Happy Birthday 2020-09-16 00:37:02 +01:00
Peter Boyle
2859955a03 HIP requires "inline" 2020-09-16 00:36:13 +01:00
Peter Boyle
cc220abd1d inline for HIP 2020-09-16 00:35:38 +01:00
Peter Boyle
d1c0c0197e HipCC requires inline on definition 2020-09-16 00:35:06 +01:00
Peter Boyle
fd9424ef27 innlines required to make HIP happy 2020-09-16 00:34:32 +01:00
Peter Boyle
a5c35c4024 Make HIP / Vega happy 2020-09-16 00:33:53 +01:00
Peter Boyle
e03b64dc06 HIP default flaags to work on ROCM 2020-09-16 00:33:09 +01:00
Peter Boyle
4677c40195 HIP improvements 2020-09-16 00:32:27 +01:00
Peter Boyle
288c615782 Hip improvements 2020-09-16 00:31:50 +01:00
Peter Boyle
48e81cf6f8 Hip Pragmas 2020-09-16 00:31:03 +01:00
Christoph Lehner
5cffa05c7e remove slab allocator file 2020-09-13 14:06:25 -04:00
Christoph Lehner
d50a2164d7 remove slab allocator 2020-09-13 14:06:06 -04:00
Christoph Lehner
32ff766dbd fix evict scheme, slab alloc 2020-09-13 14:02:53 -04:00
Christoph Lehner
01652d8cfe SlabAllocator 2020-09-13 05:56:02 -04:00
Daniel Richtmann
4d2dc7ba03 Enable even-odd for CoarsenedMatrix 2020-09-11 20:32:02 +02:00
Christoph Lehner
51d1beb1f3 Merge pull request #15 from paboyle/develop
Sync with upstream
2020-09-07 14:20:33 +02:00
Peter Boyle
65b724bb5f 2 level hddcr 2020-09-03 21:46:43 -04:00
Peter Boyle
6dbd117aa5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-03 20:30:49 -04:00
Peter Boyle
198b29f618 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-09-03 20:29:54 -04:00
Peter Boyle
a8309638d4 UVM check in MPI calls 2020-09-03 20:29:26 -04:00
Peter Boyle
f98a4e880e Merge pull request #310 from kostrzewa/accelerator_vector_stream_op_no_backspace
do not use backspace in AcceleratorVector (Coordinate) output stream operator
2020-09-03 20:24:59 -04:00
Peter Boyle
8244caff25 Remove the asynchronous non-Stencil calls. 2020-09-03 18:52:55 -04:00
Peter Boyle
bcd7895362 Include cuda.h 2020-09-03 15:49:13 -04:00
Peter Boyle
85b1c5df39 A never hit case that is not 100% confident is asserted for safety 2020-09-03 15:48:16 -04:00
Peter Boyle
b4255140d6 Stale data member eliminated 2020-09-03 15:47:46 -04:00
Peter Boyle
0c3095e173 Comms buffers to device memory 2020-09-03 15:45:35 -04:00
Peter Boyle
d3ce60713d UVM, Device and Lattice/aligned allocators 2020-09-03 15:44:13 -04:00
Peter Boyle
eac1f08b7b Close expressions passed as an argument 2020-09-01 15:30:33 -04:00
Peter Boyle
1654c4f3c0 Closure improved 2020-09-01 15:29:45 -04:00
Peter Boyle
8807d998bc closure improved 2020-09-01 15:29:11 -04:00
Peter Boyle
5791021dcd Speed up Cshift more with coalesced 2020-09-01 15:28:15 -04:00
Peter Boyle
c273fb051c Peek poke laattice 2020-09-01 15:27:59 -04:00
Peter Boyle
c545530170 little worry large Nbasis doesnt compile GPU 2020-09-01 00:14:33 -04:00
Peter Boyle
d982a5b6d5 Fix coaarsened 2020-09-01 00:14:04 -04:00
Peter Boyle
15ca8637f3 No norms in HermOp 2020-09-01 00:13:32 -04:00
Peter Boyle
cbc995b74c Made better interface 2020-09-01 00:12:54 -04:00
Peter Boyle
8b74174d74 Eigen tensor serialisatiino happy undeer GPU. Regret agreeing to let us couple Eigen types to Grid IO 2020-09-01 00:03:26 -04:00
Peter Boyle
e21fef17df real and imag part not in ET 2020-08-31 23:56:26 -04:00
Peter Boyle
3d27708f07 Basic where test 2020-08-31 23:55:49 -04:00
Peter Boyle
b918744184 Prettificatoin 2020-08-31 23:54:46 -04:00
Peter Boyle
7d14a3c086 Where working 2020-08-31 23:53:46 -04:00
Peter Boyle
e14a84317d GPU math unary calls 2020-08-31 23:50:49 -04:00
Peter Boyle
6c31b99f1f I knew coupling Eigen Tensor to Grid serialisation was a bad iddea.
Now the complex is different on GPU creates probblems
2020-08-31 23:49:19 -04:00
Peter Boyle
9522dcd611 Remove dead commented ouot coode 2020-08-31 23:40:29 -04:00
Peter Boyle
ed469898dc coalesced ET expressions 2020-08-31 23:38:40 -04:00
Peter Boyle
1eee94a809 Sorting real/im in read coalesced GPU ET 2020-08-31 23:36:49 -04:00
Bartosz Kostrzewa
54523369a3 do not use backspace in Coordinate output stream operator 2020-08-31 19:39:36 +02:00
Peter Boyle
a98c91c2a5 Merge pull request #309 from kostrzewa/format_benchmark_wilson_sweep
Format benchmark wilson sweep
2020-08-31 12:43:46 -04:00
Bartosz Kostrzewa
a9b92867a8 use tabulator 2020-08-31 18:41:17 +02:00
Bartosz Kostrzewa
65920faeba correct formatting of Benchmark_wilson_sweep output 2020-08-31 18:39:27 +02:00
Christoph Lehner
249e2db87d Merge pull request #14 from DanielRichtmann/feature/gpt-coarsenedmatrix
Expose more functions in CMat
2020-08-27 15:18:56 +02:00
Daniel Richtmann
cf3535d16e Expose more functions in CMat 2020-08-27 14:06:48 +02:00
Christoph Lehner
d61ee817f4 Merge pull request #13 from DanielRichtmann/feature/gpt-coarsenedmatrix
Changes needed for GPT MG
2020-08-27 12:11:06 +02:00
Peter Boyle
3448b7387c Almost there to coalesced ET 2020-08-26 17:04:49 -04:00
Peter Boyle
47b89d2739 Pragma protection improvementt 2020-08-26 17:04:27 -04:00
Christoph Lehner
2a75516330 state MPI/SLURM message only on world_rank zero 2020-08-26 12:34:17 -04:00
Daniel Richtmann
b2087f14c4 Fix CoarsenedMatrix regarding illegal memory accesses
Need a reference to geom since the lambda copies the this pointer which points to host memory, see
- https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture
- https://devblogs.nvidia.com/new-compiler-features-cuda-8/
2020-08-24 17:46:47 +02:00
Daniel Richtmann
dd1ba266b2 Fix mapping between dir + disp and point in CMat 2020-08-24 17:46:46 +02:00
Daniel Richtmann
1292d59563 Add a typedef + broaden interface of CMat 2020-08-24 17:46:45 +02:00
Christoph Lehner
9877ed9bf8 Merge pull request #12 from paboyle/develop
Sync
2020-08-22 16:35:35 +02:00
Christoph Lehner
f0dc0f3621 fix compile issue on Qpace3 2020-08-22 13:57:33 +02:00
Peter Boyle
1efe30d6cc SLurm stop nodes using same GPU 2020-08-21 02:02:53 +02:00
Peter Boyle
0b787e9fe0 Avoid namespaec collision to make gcc happy 2020-08-20 22:23:29 +02:00
Peter Boyle
37ec4b241c Default thread count sensible 2020-08-20 22:12:31 +02:00
Christoph Lehner
63b0a19f37 Merge pull request #11 from paboyle/develop
Sync
2020-08-20 20:53:39 +02:00
Peter Boyle
90ea7dfa99 Accelerator loops for device resident comms buf 2020-08-19 22:40:44 +02:00
Peter Boyle
f866d7c33e Merge pull request #307 from lehner/feature/gpt
Merged Nils's A64FX and minor fixes (MemoryManager::InitMessage, Tensor_index zeroit, ...)
2020-08-18 23:27:21 -04:00
Christoph Lehner
542bdef198 cleanup comments 2020-08-14 18:39:44 +02:00
Christoph Lehner
06007db3d9 true shm_none implementation with GPUs that disables the use of device shared memory for the stencils 2020-08-14 18:37:00 +02:00
Christoph Lehner
12e6059a70 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2020-08-13 16:16:52 +02:00
Christoph Lehner
dbaa24ebf6 further GPU memory access fixes (with this GPT passes all single-rank tests on non-summit GPUs) 2020-08-13 16:14:15 +02:00
Peter Boyle
3276aa67dc Update 2020-08-12 14:15:53 -04:00
Christoph Lehner
3b30b9f0c0 Merge branch 'feature/gpt' of https://github.com/lehner/Grid into feature/gpt 2020-08-06 16:59:17 +02:00
Christoph Lehner
69db4816f7 fix variable capture in Scatter_plane_merge on accelerators 2020-08-06 16:57:16 +02:00
Christoph Lehner
3abe09025a when using SHM_NONE allow multiple ranks per node but without using shared memory 2020-08-06 14:42:38 +02:00
Christoph Lehner
e33878e0de Trigger re-run of CI 2020-08-06 11:50:24 +02:00
Christoph Lehner
27b4fbf3f0 assert for forbidden code path and fix check for faster CPU codepath in basisRotate 2020-08-03 07:57:33 -04:00
Christoph Lehner
968a90633a Zero -> zeroit in Tensor_index 2020-07-31 02:07:17 -04:00
Christoph Lehner
6365a89ba3 create separate InitMessage for MemoryManager that can be called after communicator setup 2020-07-30 07:25:05 -04:00
Christoph Lehner
ddbb008694 Merge pull request #10 from lehner/feature/gpt-sycl
Feature/gpt sycl
2020-07-30 13:12:09 +02:00
Christoph Lehner
7997e0a449 Merge branch 'feature/gpt' into feature/gpt-sycl 2020-07-30 13:11:31 +02:00
Christoph Lehner
197612bc7a fast cpu basisRotate and other small cleanups 2020-07-30 07:08:54 -04:00
Christoph Lehner
0e88bf4bff remove Nils's default pragma 2020-07-29 10:24:35 -04:00
Christoph Lehner
3e64d78469 include versions.h again and add back asserts in Test_simd 2020-07-29 10:18:05 -04:00
Christoph Lehner
2004611def Merge pull request #9 from nmeyer-ur/feature/a64fx-2
Feature/a64fx 2
2020-07-29 14:54:20 +02:00
Christoph Lehner
a2868c96a4 Merge pull request #8 from paboyle/develop
Doc recompile
2020-07-29 14:10:07 +02:00
Peter Boyle
7cf7f11e1a Doc recompile 2020-07-22 14:44:11 -04:00
nmeyer-ur
ea7f8fda5e fix typo 2020-07-22 09:34:05 +02:00
nmeyer-ur
906b78811b exit in Init when using --comms-overlap 2020-07-22 08:57:01 +02:00
Christoph Lehner
97703b181b Merge pull request #7 from paboyle/develop
Merge current develop
2020-07-12 16:24:53 +02:00
nmeyer-ur
d9474c6cb6 compiler-independent build using --enable-simd=A64FX 2020-07-09 10:07:02 +02:00
nmeyer-ur
bbd145382b enable --enable-simd=A64FX in configure 2020-07-08 12:43:51 +02:00
nmeyer-ur
1b08cb7300 Merge branch 'develop' into feature/a64fx-2 2020-07-08 08:18:18 +02:00
nmeyer-ur
337d9dc043 move barrier in Benchmark_wilson 2020-07-08 08:13:40 +02:00
nmeyer-ur
8726e94ea7 merge upstream develop 2020-07-07 20:26:47 +02:00
nmeyer-ur
67db4993c2 reset head, update SVE readme 2020-07-07 19:54:52 +02:00
f1f655d92b Merge pull request #304 from Heinrich-BR/develop
ScalarImpl.h updates
2020-07-06 10:16:03 +01:00
43334e88c3 Tiny change in a comment for clarity 2020-07-04 16:11:16 +01:00
4f1e66b044 Fixed HMC SU(N) integrator which was causing fields to leave Lie Algebra manifold for N>2 2020-07-04 03:53:06 +01:00
nmeyer-ur
fd3c8b0e85 correct build instructions qp4 2020-07-01 09:00:38 +02:00
nmeyer-ur
1635c263ee disable TOFU by default 2020-06-30 19:27:08 +02:00
64fe5b21b4 Merge pull request #298 from rrhodgson/feature/baryon
Update baryon 2pt and add 3pt function
2020-06-29 18:45:00 +01:00
Peter Boyle
ee9889821d Runs through to coarse space solve 2020-06-29 12:59:52 -04:00
eb470aa6dc Update to baryon and added comments/fix whitespace 2020-06-29 09:43:01 +01:00
77af9a3ddc Baryon revert sign 2020-06-26 10:08:42 +01:00
102089798c BaryonUtils: update to autoView 2020-06-25 16:41:58 +01:00
39cea8b5a7 Merge branch 'develop' into feature/baryon 2020-06-25 16:24:07 +01:00
a65f66d2db Merge branch 'feature/baryon3pt' into feature/baryon 2020-06-25 16:20:59 +01:00
Peter Boyle
936c5ecf69 Reduction GPU no compile fix 2020-06-24 17:28:31 -04:00
Peter Boyle
22cfbdbbb3 Boost precision in inner products in single 2020-06-24 12:52:31 -04:00
Peter Boyle
093d1ee21b Force initial values 2020-06-24 08:54:49 -04:00
Peter Boyle
d6ba2581ce Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-06-24 08:25:08 -04:00
Peter Boyle
577c064184 Memory manager initialise earlier 2020-06-24 08:24:38 -04:00
Peter Boyle
2ff1fa6fad UVM used shared for CPU alloccations andd ddont migrate 2020-06-23 22:14:56 -04:00
Peter Boyle
70be1bd8be Adding code under development 2020-06-23 10:24:21 -04:00
4ef50ba31f Baryon speedup 2020-06-23 11:44:20 +01:00
3e97a26f90 BaryonGamm3pt threads -> accelerator 2020-06-23 11:35:32 +01:00
599f28f6ef Baryon bug fixes 2020-06-23 11:10:26 +01:00
Peter Boyle
c48da35921 Memory Vector UVM and Lattice alignedAllocator separate 2020-06-22 20:21:53 -04:00
Peter Boyle
6c5fa8dcd8 Aligned allocate on CPU put through this interface 2020-06-20 14:34:29 -04:00
Peter Boyle
0d2f913a1a String.h for linux 2020-06-20 09:37:31 -04:00
Christoph Lehner
5b117865b2 Merge pull request #6 from paboyle/sycl
Sycl
2020-06-20 09:44:44 +02:00
Peter Boyle
1a74816c25 Hopeefully fixed 2020-06-19 17:50:52 -04:00
Peter Boyle
73de335256 Merge branch 'develop' into sycl 2020-06-19 17:44:16 -04:00
Peter Boyle
228fd450ce Typo fix (excusee - my keyboard is starting to break) 2020-06-19 17:36:05 -04:00
Peter Boyle
b949cf6b12 PeekLocal needs a view to keep thread safe.
ALLOCATION_CACHEE reenable
2020-06-19 17:13:27 -04:00
Peter Boyle
11bc1aeadc TThread count defaultt to fastest 2020-06-19 14:30:35 -04:00
Peter Boyle
66005929af Set up the cache size on all ranks 2020-06-19 12:50:54 -04:00
Christoph Lehner
05bbc49a99 Edge case in GetShmDim check 2020-06-19 12:01:23 -04:00
Peter Boyle
ff7c847735 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-19 01:22:16 -04:00
Peter Boyle
1aa988b2af Comms overlap fix UVM case 2020-06-19 01:21:14 -04:00
Peter Boyle
edf17708a8 Range improvement 2020-06-18 22:41:06 -04:00
Christoph Lehner
81a8209749 ConvertType for blockInnerProduct 2020-06-18 11:53:21 -04:00
nmeyer-ur
a87e45ba25 SVE readme update 2020-06-18 11:23:08 +02:00
nmeyer-ur
465856331a switch back to serialized; wrong results on single too 2020-06-15 15:39:39 +02:00
nmeyer-ur
cc958aa9ed switch back to standard MPI_init due to wrong results in Benchmark_wilson using comms-overlap 2020-06-15 14:21:38 +02:00
Peter Boyle
f46f029dbb Merge pull request #292 from lehner/feature/gpt-sycl
Catch edge case in SharedMemoryMPI::GetShmDims; Change default units …
2020-06-14 13:43:27 -04:00
Christoph Lehner
3dccd7aa2c Catch edge case in SharedMemoryMPI::GetShmDims; Change default units to consistent MB in init args; Want last element not past last element in MemoryManagerCache.cc 2020-06-14 13:26:01 -04:00
nmeyer-ur
a25e4b3d0c pred 32/64 for float/double instead of 8 in VLA patch 2020-06-13 14:44:37 +02:00
nmeyer-ur
d1210ca12a switch to double/float instead of float64_t/float32_t in VLA patch 2020-06-13 13:59:32 +02:00
nmeyer-ur
36ea0e222a type traits for ComplexF/D in VLA patch; cosmetics in VLS intrinsics 2020-06-13 13:42:35 +02:00
Peter Boyle
65e6e7da6f Merge pull request #291 from lehner/feature/gpt-sycl
Feature/gpt sycl
2020-06-12 20:42:32 -04:00
Christoph Lehner
b5e87e8d97 summit compile fixes 2020-06-12 18:16:12 -04:00
Christoph Lehner
5f5807d60a cleanup 2020-06-12 14:48:23 -04:00
nmeyer-ur
92281ec22d add 3 op Mult for VLA 2020-06-12 18:49:05 +02:00
nmeyer-ur
87266ce099 comment out fcmla in vector types: need also MultAddReal 2020-06-12 18:37:19 +02:00
nmeyer-ur
2a23f133e8 reenable fcmla for VLA 2020-06-12 17:30:38 +02:00
nmeyer-ur
8dbf790f62 correct tbl2 for sp 2020-06-12 17:12:34 +02:00
nmeyer-ur
2402b4940e vec_imm in float 2020-06-12 15:17:38 +02:00
nmeyer-ur
2111052fbe apply VLA patch for memcpy reduction suggested by Arm, CAS-162542-D6W7Z7 2020-06-12 14:49:19 +02:00
Christoph Lehner
7974acff54 merged sycl to feature-gpt 2020-06-12 06:49:38 -04:00
f0d17d2b49 Added Baryon3pt code 2020-06-12 11:35:52 +01:00
244c003a1b Updated Baryon code 2020-06-12 11:00:25 +01:00
0174f5f742 look for librt when using shm=shmopen 2020-06-11 16:50:43 +01:00
Peter Boyle
32b2b59be4 Offload 2020-06-10 20:36:26 -04:00
Peter Boyle
86bb0cc24b Keep on GPU 2020-06-10 20:00:00 -04:00
Peter Boyle
84c19587e7 Offload 2020-06-10 19:59:31 -04:00
Peter Boyle
237ce92540 Offload loops 2020-06-10 19:59:11 -04:00
Peter Boyle
a7ffc61e82 acceleratorSIMTlane() 2020-06-10 19:58:33 -04:00
Peter Boyle
fd97f64612 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-10 12:58:13 -04:00
Peter Boyle
8720aecb80 Offload more loops 2020-06-10 12:57:55 -04:00
Peter Boyle
cdf0a04fc5 Merge branch 'develop' into sycl 2020-06-09 04:00:12 -04:00
Peter Boyle
616d3dd737 CCommpile updates 2020-06-08 18:57:41 -04:00
Peter Boyle
8b066baca8 Implement transient mechanism 2020-06-08 18:28:53 -04:00
Peter Boyle
e97f3688db Fix the HMC issue - kernel was launchnig asynchronously 2020-06-08 17:01:15 -04:00
nmeyer-ur
433766ac62 revert Add/SubTimesI and prefetching in stencil
This reverts commit 9b2699226c.
2020-06-08 12:02:53 +02:00
nmeyer-ur
93a37c8f68 test prefetch to L2 in stencil 2020-06-08 09:39:50 +02:00
Peter Boyle
89a1e78390 Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 23:20:37 -04:00
Peter Boyle
ffbb3fc02c Merge pull request #287 from felixerben/baryon-cleaner
slightly cleaner baryon 2pt code
2020-06-05 22:54:52 -04:00
Peter Boyle
5a73ef3647 Minor tweak to compile 2020-06-05 21:50:15 -04:00
Peter Boyle
87e5d2f4b7 Merge branch 'sycl' of https://www.github.com/paboyle/Grid into sycl 2020-06-05 17:32:21 -07:00
Peter Boyle
d720f10758 Liink error fix 2020-06-05 17:29:20 -07:00
Peter Boyle
14fcd0912a Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 19:14:17 -04:00
Peter Boyle
3111c0bd4f Single precisiono hardwire 2020-06-05 19:13:27 -04:00
Peter Boyle
e03064490e Merge branch 'sycl' of https://github.com/paboyle/Grid into sycl 2020-06-05 18:53:39 -04:00
Peter Boyle
1a4c8c3387 Global edit with change to View usage. autoView() creates a wrapper object that closes the view when scope closes. 2020-06-05 18:52:35 -04:00
Peter Boyle
2b1e259441 Decode of SYCL devices fix 2020-06-04 17:16:55 -07:00
Peter Boyle
f39c2a240b Priintinig and device memory size detection 2020-06-04 14:58:03 -04:00
Peter Boyle
0d95805cde Print improvement 2020-06-03 22:50:32 -04:00
Peter Boyle
f67830587f Accelerator loop use 2020-06-03 22:50:09 -04:00
Peter Boyle
6bf7f839ff Better printing and logging 2020-06-03 09:28:57 -04:00
Peter Boyle
e3147881a9 Cache scheme 2020-06-03 09:23:48 -04:00
nmeyer-ur
9872c76825 introduce AddTimesI and SubTimesI; slight benefit in operators, but < 1%; breaks all other impls 2020-06-03 15:20:13 +02:00
Peter Boyle
fb559614ad Initialise meemory manager 2020-06-03 09:12:47 -04:00
Peter Boyle
e93e12b6a4 More verbose SYCL setup 2020-06-03 09:12:11 -04:00
Peter Boyle
0c3112cd94 Use view mechanism 2020-06-03 09:11:51 -04:00
Peter Boyle
8cfd5d2639 Need lattice view 2020-06-03 09:11:28 -04:00
Peter Boyle
1c9f20b15e Views must be closed 2020-06-03 09:10:29 -04:00
Peter Boyle
32237895bd Reorg memory manager for O(1) hash table 2020-06-03 09:09:52 -04:00
nmeyer-ur
5ee3ea2144 round-up after testing of prefetches in stencil close 2020-06-03 11:58:20 +02:00
Peter Boyle
c5c2dbc0ce Optional CUDA info 2020-06-02 14:21:49 -04:00
Christoph Lehner
9fcb47ee63 Explicit error message instead of infinite loop in GlobalSharedMemory::GetShmDims 2020-06-02 07:44:38 -04:00
nmeyer-ur
5050833b42 revert changes due to performance penalty in Wilson using MPI 2020-06-02 13:08:57 +02:00
nmeyer-ur
7bee4ebb54 correct predication for svcadd 2020-06-02 10:51:39 +02:00
nmeyer-ur
71cf9851e7 correct type for vecd in TimesI and TimesMinusI 2020-06-02 10:44:15 +02:00
nmeyer-ur
b4735c9904 correct zero in svcadd 2020-06-02 10:38:05 +02:00
nmeyer-ur
9b2699226c use fcadd in TimesI and TimesMinusI instead of tbl and neg 2020-06-02 10:32:44 +02:00
nmeyer-ur
5f52804907 update calculation of data 2020-05-30 10:55:17 +02:00
nmeyer-ur
936071773e correct throughput in wilson and dwf 2020-05-29 22:15:59 +02:00
nmeyer-ur
1732f9319e more mods; counters seem to work correctly 2020-05-29 18:44:00 +02:00
nmeyer-ur
91c81cab30 some corrections; compiles on my laptop; untested 2020-05-29 18:19:22 +02:00
nmeyer-ur
38164f8480 include counters in WilsonFermionImplementation.h 2020-05-29 17:59:26 +02:00
nmeyer-ur
f013979791 add counter support in WilsonFermion.h 2020-05-29 17:13:59 +02:00
nmeyer-ur
e947b563ea add space in stencil output 2020-05-29 17:11:17 +02:00
nmeyer-ur
5cb3530c34 enable counters in Benchmark_wilson 2020-05-29 15:44:52 +02:00
nmeyer-ur
250008372f update SVE readme 2020-05-29 15:44:25 +02:00
Peter Boyle
1d252d0922 Accelerator inline 2020-05-28 11:45:25 -04:00
Peter Boyle
006cc8a8f1 Staggereed move to accelerator 2020-05-28 08:33:06 -04:00
nmeyer-ur
4fedd8d29f switch to MPI_THREAD_SERIALIZED instead of SINGLE 2020-05-27 14:08:34 +02:00
Peter Boyle
cf2938688a Sycl unhappy fix 2020-05-25 08:36:53 -07:00
Peter Boyle
ee63721bad int unhappiness sycl fix 2020-05-25 08:36:24 -07:00
Peter Boyle
22c5168d70 Sycl happier 2020-05-25 08:35:56 -07:00
Peter Boyle
949ac3cd24 Must avoid non-trivial copy constructors 2020-05-25 08:35:28 -07:00
Peter Boyle
7bc0166c1c SYCLL maknig happy - must avoid non ttrivial copy constructors 2020-05-25 08:34:19 -07:00
Peter Boyle
cb0d1b3399 hopefullly fix buildd fail 2020-05-24 21:27:00 -04:00
Peter Boyle
d1f1ccc705 HIP changes 2020-05-24 21:18:49 -04:00
Peter Boyle
c7519a237a Assertions fail on HIP foor unknown reasons - dedbugging 2020-05-24 14:02:47 -04:00
Peter Boyle
32be2b13d3 Updates for HiP 2020-05-24 14:00:55 -04:00
Peter Boyle
92b342a477 Hip reduction too 2020-05-24 13:50:28 -04:00
Peter Boyle
556da86ac3 HIP fp16 2020-05-24 13:41:58 -04:00
Peter Boyle
8285e41574 View location / access mode 2020-05-21 16:14:41 -04:00
Peter Boyle
f999408e92 View locatoin and access mode 2020-05-21 16:14:20 -04:00
Peter Boyle
a7abda89e2 View location & access mode 2020-05-21 16:13:59 -04:00
Peter Boyle
7860a50f70 Make view specify where and drive data motion - first cut.
This is a compile tiime option --enable-unified=yes/no
2020-05-21 16:13:16 -04:00
nmeyer-ur
6ddcef1bca fix build error enabling fcmla/mac in vector types for VLA 2020-05-21 21:21:03 +02:00
nmeyer-ur
8c5a5fdfce disable fcmla in vector type building for VLA 2020-05-21 19:41:42 +02:00
nmeyer-ur
046b1cbbc0 enable fcmla in tensor arithmetics; fixed-size works, VLA does not compile 2020-05-21 19:39:07 +02:00
nmeyer-ur
a65ce237c1 clean up; Exch1 VLA sp+dp integrate, tested, working 2020-05-21 09:48:06 +02:00
nmeyer-ur
cd27f1005d clean up; Exch1 sp integrate, tested, working 2020-05-21 08:45:43 +02:00
nmeyer-ur
f8c0a59221 clean up; Exch1 dp integrate, tested, working 2020-05-21 02:48:14 +02:00
nmeyer-ur
832485699f save some cycles in HtoD and DtoH by direct instead of multi-pass conversion 2020-05-20 23:04:35 +02:00
nmeyer-ur
81484a4760 symmetrize Mult and MultAddComplex 2020-05-20 22:36:45 +02:00
nmeyer-ur
9a86059761 symmetrize VLA and fixed size build messages 2020-05-20 20:05:42 +02:00
nmeyer-ur
b780b7b7a0 guard prevents multiple TOFU messages 2020-05-20 19:20:59 +02:00
nmeyer-ur
9e085bd04e guard prevents multiple A64FX build messages 2020-05-20 19:16:30 +02:00
ferben
6c6812a5ca GB/s output 2020-05-20 12:26:57 +01:00
Christoph Lehner
8358ee38c4 pull develop 2020-05-19 08:56:18 -04:00
ferben
1f154fe652 some cleanup in BaryonUtils 2020-05-19 13:48:56 +01:00
ferben
d708c0258d some cleanup in BaryonUtils 2020-05-19 13:48:00 +01:00
Christoph Lehner
a7635fd5ba summit mem 2020-05-18 17:52:26 -04:00
nmeyer-ur
6b6bf537d3 comment out mac in vector types 2020-05-18 20:36:16 +02:00
nmeyer-ur
323a651c71 correct typo 2020-05-18 19:58:27 +02:00
nmeyer-ur
9f212679f1 support fcmla in vector_types, untested 2020-05-18 19:55:18 +02:00
nmeyer-ur
032f7dde1a update SVE readme, asm generator 2020-05-18 19:10:36 +02:00
Peter Boyle
ebb60330c9 Automatic data motion options beginning 2020-05-17 16:34:25 -04:00
5aa60be17d SerialisableClassName method for serialisable enum, and boolean to test if a serialisable object is an enum 2020-05-15 20:00:34 +01:00
nmeyer-ur
50b1db1e8b implemented correct _m form (using 3 operands instead of 2) 2020-05-15 10:01:05 +02:00
nmeyer-ur
015d8bb38a introduced assertions in Benchmark_wilson, removed data output from Benchmark_dwf 2020-05-15 09:15:50 +02:00
nmeyer-ur
10a34312dc some fixed-size code clean up 2020-05-14 23:20:16 +02:00
nmeyer-ur
db8c0e7584 replaced _x form with _m form when using even/odd predication 2020-05-14 23:17:35 +02:00
Christoph Lehner
32fbdf4fb1 Merge pull request #5 from paboyle/develop
Sync upstream
2020-05-13 09:02:56 +02:00
Peter Boyle
a9847aa866 Dependence fix 2020-05-12 20:03:37 -04:00
Peter Boyle
2e652431e5 No compile on summiit fix 2020-05-12 18:56:47 -04:00
Peter Boyle
8b5b55b682 Make tests all compile ccurrent Grid, mostly MdagM removal of norms fixes but a few minor
issues fiixed too
2020-05-12 17:57:24 -04:00
Peter Boyle
0e3c49f687 TransposeIndex was broken by Christoph 2020-05-12 17:57:01 -04:00
Peter Boyle
cb7ee37562 Close expressions in arg to cshift 2020-05-12 17:56:40 -04:00
Peter Boyle
82f71643a4 Remove the norm in MdagM 2020-05-12 17:55:53 -04:00
nmeyer-ur
d15ccad8a7 switched to vec* in Reduce 2020-05-12 20:41:14 +02:00
nmeyer-ur
0009b5cee8 updated SVE_README 2020-05-12 19:02:33 +02:00
nmeyer-ur
20d1941a45 enabled asm kernels for fixed-size A64FXFIXEDSIZE 2020-05-12 19:01:12 +02:00
Peter Boyle
d24d8e8398 Use X-direction as more bits meaningful on CUDA.
2^31-1 shoulddd always bee enough for SIMD and thread reduced local volume

e.g. 32*2^31 = 2^36 = (2^9)^4 or 512^4 ias big enough.

Where 32 is gpu_threads * Nsimd = 8*4
2020-05-12 10:35:49 -04:00
Christoph Lehner
162e4bb567 no automatic prefetching for now 2020-05-12 07:01:23 -04:00
Peter Boyle
07c0c02f8c Speed up Cshift 2020-05-11 17:02:01 -04:00
Peter Boyle
8c31c065b5 Keep the Vector fixed to protect it from realloc 2020-05-11 17:00:30 -04:00
nmeyer-ur
b7c76ede29 Removed some assertions in Test_simd and removed exit() in Reduce 2020-05-11 22:43:00 +02:00
nmeyer-ur
05edf803bd corrected typo 2020-05-12 03:59:59 +09:00
Christoph Lehner
b1c86900b2 Merge pull request #4 from paboyle/develop
merge
2020-05-11 20:59:29 +02:00
nmeyer-ur
78b8e40f83 switched to gcc's internal data types 2020-05-11 18:11:23 +02:00
nmeyer-ur
fc2e9850d3 temporarily enable TOFU by default when using A64FX or A64FXFIXEDSIZE 2020-05-11 13:25:02 +02:00
nmeyer-ur
ffaaed679e MPI_THREAD_SINGLE hack for Fugaku, enabled by -DTOFU 2020-05-11 13:21:39 +02:00
Peter Boyle
bbbee5660d First compiile on HiP 2020-05-10 05:28:09 -04:00
Peter Boyle
ea08f193e7 Allocator cache spliit into large/small pools 2020-05-10 05:24:26 -04:00
Peter Boyle
2bb2c68e15 Separate pools for small and large allocations cache 2020-05-09 22:57:21 -04:00
Peter Boyle
efe5bc6a3c Split allocator cache into two pools of different sizes 2020-05-09 22:27:56 -04:00
nmeyer-ur
b2fd8b993a fixed-size clean up 2020-05-09 22:53:42 +02:00
nmeyer-ur
291ee8c3d0 updated fixed-size implementation; only Exch1 and prefetches missing 2020-05-09 22:18:02 +02:00
nmeyer-ur
e1a5b3ea49 unions for tables eliminate explicit loads, gcc does not complain 2020-05-09 21:21:57 +02:00
nmeyer-ur
55a55660cb reverted changes 2020-05-09 12:48:42 +02:00
Peter Boyle
384da487bd Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-05-08 18:55:11 -04:00
Peter Boyle
ee1de82a53 Working ITT benchmark again 2020-05-08 18:54:50 -04:00
Peter Boyle
2b576fc185 Comment deadd codde remove 2020-05-08 18:54:29 -04:00
Peter Boyle
52081acfa5 NVCC compile fixes 2020-05-08 13:14:12 -04:00
Peter Boyle
b01b7f761a Merge pull request #283 from DanielRichtmann/feature/minor-fixes
Some small fixes
2020-05-08 10:52:03 -04:00
Daniel Richtmann
c83471bfd0 Fix missing checkerboards for adj und conjugate 2020-05-08 16:44:03 +02:00
Daniel Richtmann
ab0c5d77fb Correct NonHermitianSchurOperatorBase 2020-05-08 16:44:02 +02:00
Daniel Richtmann
779e3c7442 Const-correctness for retrieval routines of GridStopWatch 2020-05-08 16:43:52 +02:00
Daniel Richtmann
0c570824f2 Add missing declaration of GridCmdOptionInt 2020-05-08 16:43:51 +02:00
Peter Boyle
f8b8e00090 Systematise the accelerator primitives and locate to Grid/threads/Accelerator.h / Accelerator.cc
Aim to reduce the amount of cuda and other code variations floating around all over the place.

Will move GpuInit iinto Accelerator.cc from Init.cc
Need to worry about SharedMemoryMPI.cc and the Peer2Peer windows
2020-05-08 06:23:55 -07:00
Peter Boyle
0dd1bdfa94 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-05-08 09:21:43 -04:00
Peter Boyle
1d65e2f62c Slightly faster Chebyshev; ifdef'ed out the fastest until tested numerics
Lifteed from HDCR setup
2020-05-08 09:20:54 -04:00
Peter Boyle
93920c4811 Remove verbose 2020-05-08 09:19:54 -04:00
Peter Boyle
6859a3e1d4 Schur operator 2020-05-08 09:19:12 -04:00
Peter Boyle
21ca182c36 Comments remove 2020-05-08 09:18:24 -04:00
nmeyer-ur
ceb8b374da API change v3 2020-05-08 15:04:44 +02:00
nmeyer-ur
4bc2ad2894 API change v2 2020-05-08 15:00:25 +02:00
nmeyer-ur
798af3e68f retry changing StoD API 2020-05-08 14:34:59 +02:00
nmeyer-ur
b0ef2367f3 testing alternate call to PrecisionChange 2020-05-08 14:22:44 +02:00
nmeyer-ur
71a7350a85 changed 2nd argument in Reduce to native vector type 2020-05-08 12:26:51 +02:00
nmeyer-ur
6f79369955 trying to get rid of macro definition error 2020-05-08 12:19:24 +02:00
nmeyer-ur
f9cb6b979f corrected more typos 2020-05-08 12:11:01 +02:00
nmeyer-ur
ed4d9d17f8 corrected type 2020-05-08 12:09:22 +02:00
nmeyer-ur
fbed02690d some changes in breaking out A64FX: use -DA64FXFIXEDSIZE for fixed size, but also define GEN 2020-05-08 12:05:31 +02:00
nmeyer-ur
39f3ae5b1d corrected more types 2020-05-08 11:07:14 +02:00
nmeyer-ur
e64bec8c8e pulled SVE typedefs out of Optimization 2020-05-08 11:04:21 +02:00
nmeyer-ur
0893b4e552 fixed typos in PrecisionChange 2020-05-08 10:59:07 +02:00
nmeyer-ur
92f0f29670 fixed double overloading vecf in Div, corrected typos 2020-05-08 10:57:23 +02:00
nmeyer-ur
48a340a9d1 GEN seems to defined by default -> some fixes applied 2020-05-08 10:47:49 +02:00
nmeyer-ur
f45621109b placed typedefs in Optimization 2020-05-08 10:41:52 +02:00
nmeyer-ur
32d1a0bbea added even more debug output 2020-05-08 10:39:26 +02:00
nmeyer-ur
267cce66a1 added more debug output 2020-05-08 10:29:28 +02:00
nmeyer-ur
3417147b11 added real fma, corrected typos in tbls; integrated, must supply A64FXGCC with GEN in configure 2020-05-08 10:20:19 +02:00
nmeyer-ur
b338719bc8 first transition to fixed-size done, excl. Exch; next step: integration 2020-05-07 22:33:28 +02:00
nmeyer-ur
2b81cbe2c2 first attempt to introduce tables using fixed-size; still incomplete 2020-05-07 22:01:19 +02:00
nmeyer-ur
acff9d6ed2 transition to fixed size data types almost done; still incomplete 2020-05-07 21:24:07 +02:00
053b4dd495 Merge pull request #282 from felixerben/baryon-reversal
Baryon reversal
2020-05-07 18:09:17 +01:00
nmeyer-ur
a306a49788 first mods for fixed size; still incomplete 2020-05-07 19:07:49 +02:00
ferben
42bb5f0721 asserrtion 2020-05-07 18:06:12 +01:00
ferben
253bcc3426 back to old version 2020-05-07 18:03:17 +01:00
a887206413 Merge pull request #281 from felixerben/feature/baryonSpeedup
Feature/baryon speedup
2020-05-07 13:41:29 +01:00
ferben
591ebb6213 Merge branch 'develop' of github.com:paboyle/Grid into feature/baryonSpeedup 2020-05-07 11:13:21 +01:00
ferben
56e2f7d088 deleted test routines. cleaned up fast version. assert Ns=4,Nc=3. 2020-05-07 10:03:45 +01:00
nmeyer-ur
7ef03c5368 updated SVE readme 2020-05-06 16:30:37 +02:00
Peter Boyle
525418abfb Merge pull request #273 from lehner/feature/gpt
Feature/gpt
2020-05-06 10:10:51 -04:00
Peter Boyle
5f780806c2 Merge pull request #279 from paboyle/bugfix/nvcc-config
configure fix for nvcc with extra arguments as CXX
2020-05-06 10:07:52 -04:00
Christoph Lehner
3c6ffcb48c Merge branch 'develop' into feature/gpt 2020-05-06 15:03:35 +02:00
Christoph Lehner
87984ece7d add Lattice_basis.h 2020-05-06 08:47:18 -04:00
Christoph Lehner
e9b295f967 Synchronize blocking infrastructure with GPT 2020-05-06 08:42:28 -04:00
Peter Boyle
224cbf0453 Merge pull request #280 from mmphys/bugfix/ET_go_home
Bugfix/et go home
2020-05-05 17:56:51 -04:00
Michael Marshall
c1e57d4357 Merge branch 'develop' into bugfix/ET_go_home
* develop:
  SYCL prep - no sycl just make it compile through DPC++
  dpc++ didn't like rdtsc()
  Make compile if HAVE_LIME=0
  Lime optional
2020-05-05 22:35:04 +01:00
Peter Boyle
28a1fcaaff First compile against SYCL 2020-05-05 11:13:27 -07:00
Christoph Lehner
6b64727161 disable comments 2020-05-05 05:05:36 -04:00
Christoph Lehner
04863f8f38 debug new AcceleratorView 2020-05-04 16:07:03 -04:00
u37294
04927d2e40 SYCL prep - no sycl just make it compile through DPC++ 2020-05-04 10:28:29 -07:00
u37294
7caed4edd9 dpc++ didn't like rdtsc() 2020-05-04 10:27:05 -07:00
u37294
59c51d2c35 Make compile if HAVE_LIME=0 2020-05-04 10:26:20 -07:00
u37294
ff53b231c8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-05-04 10:25:10 -07:00
u37294
fc19cf905b Lime optional 2020-05-04 10:24:48 -07:00
Christoph Lehner
2a1387e992 rankInnerProduct 2020-05-03 17:27:11 -04:00
Christoph Lehner
9bfa51bffb cleanup comment 2020-05-03 09:12:52 -04:00
Christoph Lehner
38532753f4 interface cleanup 2020-05-03 08:58:32 -04:00
Christoph Lehner
949be9605c fix pragmas 2020-05-02 16:20:03 -04:00
Christoph Lehner
63cf201ee7 Add AdviseInfrequentUse 2020-05-02 11:38:42 -04:00
Christoph Lehner
c8af498a2a BinaryIO fix for alternative little-endian format name (used in 96I ensemble) 2020-05-01 03:45:50 -04:00
Christoph Lehner
ddb192bac7 re-work double precision promotion for summit 2020-04-30 16:09:57 -04:00
Michael Marshall
7666300a6f Merge branch 'develop' into bugfix/ET_go_home
* develop:
  Basis rotate stack passig to GPU reduction
  Clean up warning
2020-04-30 20:10:32 +01:00
Michael Marshall
4a4b9e305d Fix: strToVec enters infinite loop and exhausts memory if operator>> fails before the end of string, e.g. if parsing "0_0_0" for momentum instead of "0 0 0". 2020-04-30 19:40:04 +01:00
Peter Boyle
9b2d2d0fc3 Basis rotate stack passig to GPU reduction 2020-04-30 12:31:07 -04:00
Peter Boyle
5011753f4f Clean up warning 2020-04-30 10:23:48 -04:00
Michael Marshall
dbaeefaeef All Eigen::TensorMap objects are fixed (i.e. cannot be dynamically resized) 2020-04-30 15:02:51 +01:00
Christopher Kelly
dee96cbf82 Added workaround in configure to still catch Cuda compiler when nvcc with extra arguments (eg -ccbin) is used as CXX 2020-04-29 10:37:11 -04:00
Peter Boyle
dd3ebc2ce4 Slow compile on NVCC switch off conserved current 2020-04-29 08:43:12 -04:00
Peter Boyle
103e7ae2f0 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-04-29 03:05:36 -04:00
Peter Boyle
29ae5615c0 Seqeuential fix 2020-04-29 03:05:15 -04:00
ferben
6240e02619 added assertion to avoid potential infinite loop 2020-04-27 18:50:53 +01:00
ferben
f4033ad8cb baryon speedup by a factor 2 2020-04-27 17:46:14 +01:00
nmeyer-ur
5abec5b8a9 SVE_readme update, update Grid_vector_types.h 2020-04-25 13:48:26 +02:00
nmeyer-ur
499edc0636 updated SVE_README.txt; defined ARMCLANGCOMPAT macro 2020-04-25 13:41:24 +02:00
nmeyer-ur
d990e61be3 armclang 20.1 settings in SVE readme 2020-04-25 12:11:43 +02:00
nmeyer-ur
3edb2dc2da removed -static from gcc CXXFLAGS 2020-04-24 13:04:34 +02:00
Christoph Lehner
f1fe444d4f blocked precision promotion infrastructure upgrade 2020-04-24 06:27:20 -04:00
nils meyer
345721220e resolved merge conflict 2020-04-24 10:14:21 +02:00
nils meyer
6db68d6ecb added SVE configure for armclang and gcc 2020-04-24 10:10:47 +02:00
Peter Boyle
dae820aa96 Merge pull request #277 from mmphys/bugfix/grid-config
Bugfix/grid config
2020-04-23 10:26:54 -04:00
Michael Marshall
5daf176f4a Updated to expose GRID_CXXLD in addition to CXXLD.
NB: CXXLD required as this is what drives linking behaviour.
2020-04-23 15:25:53 +01:00
Michael Marshall
e96c86ec14 Make grid-config message more specific for --cxx and --cxxld 2020-04-23 13:10:45 +01:00
nmeyer-ur
09f0963d1f changes in configure.ac ; to be verified 2020-04-23 11:27:03 +02:00
nils meyer
6f44e3c192 reverted changes in configure.ac ; included SVE configure readme 2020-04-23 11:18:50 +02:00
Peter Boyle
c2c3cad20d Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2020-04-23 04:35:42 -04:00
Peter Boyle
edec9ee2e2 Conserved current rewrite done. Zmobius working 2020-04-23 04:34:01 -04:00
Peter Boyle
ed70cce542 Test for 5D DWF obserevables 2020-04-23 04:29:45 -04:00
Michael Marshall
4701201b5f grid-config: Expose CXXLD (for GPU build) and update help 2020-04-22 18:42:30 +01:00
nils meyer
5893888f87 removed default no-strict-aliasing for gcc-10.0.1 exclusively 2020-04-22 19:29:55 +02:00
nmeyer-ur
39b448affb Merge remote-tracking branch 'origin/develop' into feature/a64fx-2 2020-04-22 17:34:12 +02:00
nils meyer
e54a8f05a9 Exchange1 with generic version for now, should use svtbl2 in final version 2020-04-20 22:45:27 +02:00
Peter Boyle
0782b76ed4 Merge pull request #274 from paboyle/feature/zmobius_paramcompute
ZMobius parameter computation
2020-04-20 14:39:29 -04:00
Christopher Kelly
0896f2cead Added missing include guards in bigfloat_double.h 2020-04-20 10:30:38 -04:00
Christopher Kelly
181709bba4 Merge branch 'develop' into feature/zmobius_paramcompute 2020-04-20 09:12:34 -04:00
nils meyer
64b72fc17f testing gcc 10.0.1: build errors in Exchange1 using -DA64FX and in Lattice_base.h building Dslash only 2020-04-19 01:25:40 +02:00
Christoph Lehner
091d5c605e towards more precise blocking 2020-04-17 04:25:28 -04:00
nils meyer
6fdce60492 revised BodyA64FX; 990 GiB/s Wilson, 687 GiB/s DW using intrinsics (armclang 20.0) 2020-04-16 22:43:32 +02:00
Peter Boyle
90229cfb0f Merge pull request #270 from milc-qcd/feature/CGinfo
feature/CGinfo
2020-04-16 11:46:08 -04:00
Peter Boyle
0475c46ecb Merge pull request #256 from djm2131/feature/BiCGSTAB
Import BiCGSTAB solvers and tests
2020-04-16 11:45:15 -04:00
Peter Boyle
3cca10e617 Merge pull request #276 from nils-asmussen/fix/regression_nt
fix regression in tests/core/Test_qed.cc
2020-04-16 11:42:39 -04:00
Christoph Lehner
327da332bb Merge branch 'develop' of https://github.com/paboyle/Grid into feature/gpt 2020-04-16 11:30:17 -04:00
nils meyer
852db4626a re-introduced HOTFIX cause Grid binaries give wrong results otherwise; checked in good gridverter.py 2020-04-15 18:22:19 +02:00
43dc2814dd fix regression in core/Test_qed.cc 2020-04-15 16:10:15 +01:00
nils meyer
6504a098cc 999 GiB/s Wilson; 694 GiB/s DW (DP) 2020-04-15 15:06:52 +02:00
nils meyer
79a385faca disabled armclang hotfix cause armclang 20.0 performance gets a little 2020-04-15 11:46:55 +02:00
nils meyer
c12a67030a 980 GiB/s Wilson; 680 GiB/s DW (DP) 2020-04-15 10:55:06 +02:00
nils meyer
581392f2f2 now with pf, best results so far using intrinsics+pf 2020-04-12 22:06:14 +02:00
nils meyer
113f277b6a enable dslash asm using -DA64FXASM, additionaly -DDSLASHINTRIN for intrinsics impl 2020-04-11 04:55:01 +02:00
Peter Boyle
f3a8d039a2 Merge branch 'feature/hdcr' into develop 2020-04-10 22:01:52 -04:00
nils meyer
974586bedc Dslash finally works; cleaned up; uses MOVPRFX in assembly 2020-04-10 22:26:40 +02:00
4e864e56c9 develop pull 2020-04-10 17:19:18 +01:00
Peter Boyle
014dbfa464 Compile fix with OpDirAll 2020-04-10 11:57:09 -04:00
Peter Boyle
3b0e07882f Adding another form of polynomial 2020-04-10 11:28:33 -04:00
Peter Boyle
8e81a811d0 Merge branch 'feature/hdcr' into develop 2020-04-10 11:14:49 -04:00
Peter Boyle
aa13118127 Missing conjugate already fixed in develop 2020-04-10 11:11:24 -04:00
Peter Boyle
6cdb09c884 Faster copy region 2020-04-10 11:10:52 -04:00
Peter Boyle
a65bc64f10 Accelerator peek poke 2020-04-10 11:09:59 -04:00
Peter Boyle
11dec4883c Don't throw assert 2020-04-10 11:09:11 -04:00
Peter Boyle
afa458c812 Extra solvers 2020-04-10 11:08:19 -04:00
Peter Boyle
dc50190b8f Faster GPU basis rotation
May need to later include Regensburg optimised CPU variant
2020-04-10 11:06:04 -04:00
nmeyer-ur
160f78c1e4 changed debug output to variable direct 3 2020-04-10 12:23:07 +02:00
nmeyer-ur
7e4e1bbbc2 changed debug output to variable direct 2 2020-04-10 12:22:04 +02:00
nmeyer-ur
e699b7e9f9 changed debug output to variable direct 2020-04-10 12:18:30 +02:00
nmeyer-ur
a28bc0de90 debug register address test in WilsonHand 2020-04-10 12:07:45 +02:00
nmeyer-ur
14d0fe4d6c added predication in WilsonHand 2020-04-10 12:04:00 +02:00
nmeyer-ur
0ad2e0815c debug output in WilsonHand 2020-04-10 11:56:29 +02:00
nils meyer
1c8ca05e16 Merge branch 'feature/a64fx-2' of https://github.com/nmeyer-ur/Grid into feature/a64fx-2 2020-04-09 23:32:19 +02:00
nils meyer
dc9c8340bb switched to DSLASHINTRIN for A64FX Dslash intrinsics 2020-04-09 23:30:23 +02:00
nils meyer
19eef97503 specialized A64FX Dslash kernels 2020-04-09 23:25:25 +02:00
nmeyer-ur
635246ce50 corrected typo 2020-04-09 21:42:50 +02:00
nils meyer
5cdbb7e71e fixed A64FX Dslash; compiles, but does not specialize -> assertion 2020-04-09 21:23:39 +02:00
nmeyer-ur
8123590a1b changes 2020-04-09 16:45:47 +02:00
nmeyer-ur
86c9c4da8b changes 2020-04-09 16:40:06 +02:00
nmeyer-ur
cd1efee866 changes 2020-04-09 16:35:13 +02:00
nmeyer-ur
bd310932f7 changes 2020-04-09 16:32:31 +02:00
nmeyer-ur
304762e7ac changes 2020-04-09 16:26:01 +02:00
nmeyer-ur
d79ab03a6c changes 2020-04-09 16:19:25 +02:00
nmeyer-ur
d5708e0eb2 more changes 2020-04-09 15:43:34 +02:00
nmeyer-ur
123f6b7a61 more changes 2020-04-09 15:17:19 +02:00
nmeyer-ur
2b6457dd9a added xp/xm recon accum 2020-04-09 15:13:19 +02:00
nmeyer-ur
b367cbd422 defined ADD_RESULT 2020-04-09 15:08:45 +02:00
nmeyer-ur
e252c1aca3 addressing 2020-04-09 15:03:12 +02:00
nmeyer-ur
b140c6a4f9 addressing 2020-04-09 15:01:15 +02:00
nmeyer-ur
326de36467 revised sU addressing scheme 2020-04-09 14:44:25 +02:00
nmeyer-ur
9f224a1647 fixed typo in single 2020-04-09 14:30:21 +02:00
nmeyer-ur
bb46ba9b5f fixed array size in single 2020-04-09 14:28:45 +02:00
nmeyer-ur
dd5a22b36b revised declarations 2020-04-09 14:21:27 +02:00
nmeyer-ur
1ea85b9972 Disabled build message 2020-04-09 13:47:21 +02:00
nmeyer-ur
8fb63f1c25 added A64FX Wilson kernels single precision 2020-04-09 13:41:04 +02:00
nmeyer-ur
77fa586f6c introduced A64FX Wilson kernels 2020-04-09 13:30:06 +02:00
Christoph Lehner
96e8e44fd4 Merge pull request #2 from DanielRichtmann/feature/fused-innerproduct-norm2
Fused innerProduct + norm2 on first argument operation
2020-04-06 13:16:58 +02:00
Daniel Richtmann
5fc8a273e7 Fused innerProduct + norm2 on first argument operation 2020-04-06 11:52:29 +02:00
d671a63e78 Update README.md 2020-04-03 19:52:15 +01:00
nmeyer-ur
15238e8d5e reduce acle works, clean up 2020-04-03 20:40:44 +02:00
nmeyer-ur
b27e31957a reduce acle revised 2020-04-03 19:46:15 +02:00
nmeyer-ur
46927771e3 reduce acle still needs overhaul 2020-04-03 19:30:48 +02:00
nmeyer-ur
d8cea77707 define simd width in header 2020-04-03 19:22:25 +02:00
nmeyer-ur
5f8a76d490 clean up, reduction in acle 2020-04-03 19:18:24 +02:00
nmeyer-ur
28d49a3b60 build problem resolved 2020-04-03 16:52:48 +02:00
nmeyer-ur
b4c624ece6 added A64FX support 2020-04-03 15:43:23 +02:00
2c22db841a Added momentum scaling to scalar HMC theories in order to follow UKQCD/CPS conventions 2020-04-02 17:38:47 +01:00
Christoph Lehner
856d168e41 global sum over vectors of uint64_t 2020-03-29 07:56:05 -04:00
6235c7ba98 IPP path fix in configure 2020-03-27 17:23:29 +00:00
7e13724882 removing Hadrons 2020-03-27 12:03:32 +00:00
Christoph Lehner
b6cbdd2aa3 Merge pull request #1 from DanielRichtmann/feature/read-openqcd
Feature/read openqcd
2020-03-26 17:39:04 +01:00
Christoph Lehner
a2188ea875 remove debugging printf from WilsonKernelsImplementation 2020-03-26 09:12:36 -04:00
Daniel Richtmann
989af65807 Check in parallel reader for openqcd configs 2020-03-24 11:20:54 +01:00
Christoph Lehner
60db3133d3 make trace,adj,transpose unary operators 2020-03-16 17:59:56 -04:00
Christoph Lehner
c9b737a4e7 make trace,adj,transpose unary operators 2020-03-16 17:58:30 -04:00
Daniel Richtmann
037bb6ea73 Check in reader for openqcd configs
This reader is suboptimal in the sense that it opens the entire config on every MPI rank.
2020-03-16 14:28:02 +01:00
05ebc458e2 Merge pull request #260 from mmphys/feature/distil
Distillation: save eigenvalues of the Laplacian for all timeslices
2020-03-13 14:00:21 +00:00
Michael Marshall
3753508957 Making change 1) as simple as possible 2) as much like MSink/Point.hpp as possible 2020-03-12 13:47:51 +00:00
Michael Marshall
c1677fccf6 Merge branch 'develop' into feature/distil
* develop:
  bugfix ZPerambulator
  registered module supporting ZMobius action
  changed to push_back according to request
  Added Hadrons_Error in case blockSize is set too large
  bugfix in perambulator module

# Conflicts:
#	Hadrons/Modules/MDistil/Perambulator.hpp
2020-03-12 12:45:18 +00:00
35e8e31749 Merge pull request #272 from mmphys/feature/ZPeramb
bugfix ZPerambulator
2020-03-12 12:28:04 +00:00
34813e9b04 Merge branch 'develop' into feature/ZPeramb 2020-03-12 12:27:56 +00:00
Felix Erben
373cf61abb bugfix ZPerambulator 2020-03-12 11:44:43 +00:00
4e8fbc4b49 Merge pull request #271 from mmphys/feature/ZDistil
registered module supporting ZMobius action
2020-03-12 10:54:07 +00:00
ferben
516ac1d4d5 registered module supporting ZMobius action 2020-03-12 10:52:27 +00:00
318f63eb34 Merge pull request #268 from mmphys/a2a-error-log
Added Hadrons_Error in case blockSize is set too large
2020-03-11 11:09:00 +00:00
16503d7532 Merge pull request #267 from mmphys/feature/distil-bugfix
bugfix in perambulator module
2020-03-11 11:08:23 +00:00
ferben
0fa93383b7 changed to push_back according to request 2020-03-11 09:05:01 +00:00
ferben
0a827aa7bf Added Hadrons_Error in case blockSize is set too large 2020-03-11 08:52:52 +00:00
Carleton DeTar
165c68e28e Change TrueResiduals to TrueResidualShift and IterationsToComplete to IterationsToCompleteShift 2020-02-29 17:51:51 -06:00
ferben
b32b1ca642 bugfix in perambulator module 2020-02-26 12:06:45 +00:00
Carleton DeTar
9479bc8486 Make IterationsToComplete and TrueResidual externally accessible 2020-02-19 17:43:57 -06:00
Peter Boyle
8a5c13d5fb Still fast moving in changes 2020-02-06 17:57:26 -05:00
Peter Boyle
bdccb0c91f Working 2 types of decomposition 2020-02-06 17:26:55 -05:00
Peter Boyle
68b45f6444 Lower left/upper right region cut paste 2020-02-06 15:50:26 -05:00
Peter Boyle
ef9b3e658a extra typedef 2020-02-06 15:47:14 -05:00
Peter Boyle
b9ca40cc44 More precise power method at start 2020-02-06 10:09:14 -05:00
Peter Boyle
2f421a5db1 Commeent fix 2020-02-06 10:08:27 -05:00
Michael Marshall
10192dfc71 Wall source momenta must be specified for spatial components only.
So we don't break existing scripts, allow momentum in time direction as well, but only if zero.
Fail early, so do the check in setup()
2020-01-31 15:02:03 +00:00
Michael Marshall
c69a3b6ef6 When saving eigenvectors, LapEvec now saves eigenvalues for every timeslice as well.
I.e. nT x nVec eigenvalues are saved in FileName.evals.conf.h5.
A new named tensor, "TimesliceEvals" can be used to simplify restoring these from disk.
NB: The changes in BaseIO add support so that Eigen tensors can be easily used in MPI operations, e.g. GlobalSum.
See LapEvec.hpp for an example of how this is done.
2020-01-29 21:20:20 +00:00
Peter Boyle
852fc1b001 True Hierachical multigrid for DWF 2020-01-27 13:45:10 -05:00
Peter Boyle
2b5de5bba5 MdagM operator without norm option 2020-01-27 13:44:30 -05:00
Peter Boyle
2e85cae74e Add Jacobi polynomials 2020-01-27 13:43:49 -05:00
Peter Boyle
76c823781e Much faster coarsening 2020-01-27 13:43:19 -05:00
Peter Boyle
114db3b99d Optional MdagM without norms 2020-01-27 13:42:51 -05:00
Peter Boyle
49e123dbda Use explicit linalg calls to get coalesce optimisations on GPU 2020-01-27 12:44:51 -05:00
Peter Boyle
8cec294ec9 Make CG a bit less verbose as gettign annoying in nested algorithms.
Can use Iterative logging if you want to see more
2020-01-27 12:44:04 -05:00
Peter Boyle
eb5b720e94 Normal Equations can be used in HDCR now 2020-01-27 12:43:29 -05:00
Peter Boyle
b2736ec80b Make PrecGCR recursive - it can precondition itself 2020-01-27 12:42:48 -05:00
Peter Boyle
086256a032 Less sloppy convergence test on PowerMethod 2020-01-27 12:41:59 -05:00
Peter Boyle
afc7426f39 Much bigger pointer cache in case of Nvidia due to cost of setting up UVM allocations 2020-01-27 12:41:16 -05:00
Peter Boyle
7c061e20c9 All directions of dirac operator for fastt coarsening 2020-01-27 12:40:13 -05:00
Peter Boyle
e5d1c09665 Faster DhopDirAll for little dirac operator coarsening 2020-01-27 12:38:54 -05:00
Peter Boyle
8016a465ae Remove extraneous variable 2020-01-27 12:35:37 -05:00
Peter Boyle
d8b9742092 DhopDirAll for faster matrix elements of little Dirac operator 2020-01-27 12:34:54 -05:00
Peter Boyle
1bd87c35d7 Read coalescing on Nvidia 2020-01-27 12:29:56 -05:00
Peter Boyle
fa856c9669 Disable information message 2020-01-27 12:28:46 -05:00
Peter Boyle
48008e4d8b Thread coordinate creation loop 2020-01-27 12:28:16 -05:00
Peter Boyle
55cdb17691 Integer divide for blocking 2020-01-27 12:27:45 -05:00
Michael Marshall
2ed39ebb7a Perambulator won't even allocate memory for unsmeared sinks unless the filename is specified.
Prior to this update, memory is allocated regardless of whether these are requested.
2020-01-24 13:01:06 +00:00
Christopher Kelly
96671bbb24 Added ability to pass callback to MADWF that is called every inner iteration and allows user to, for example, adjust the inner solver tolerance depending on residual
Added a general implementation of the Remez algorithm for producing arbitrary rational polynomial approximation with optional restriction to even/odd polynomials
Added implementation of computation of ZMobius parameters
Added Test_zMADWF_prec to test ZMobius in MADWF
2020-01-17 12:45:30 -08:00
Peter Boyle
554542b773 Merge branch 'feature/hdcr' of https://github.com/paboyle/Grid into feature/hdcr 2020-01-06 11:47:56 -05:00
Peter Boyle
03da4040e2 Make summit happy 2020-01-06 11:47:48 -05:00
Peter Boyle
e583035614 Change to interface to minise comms in evaluating coarse space operator 2020-01-06 11:43:59 -05:00
Peter Boyle
3c3d6a94f3 OPtimising the force term a bit 2020-01-04 03:16:23 -05:00
Peter Boyle
205ea4bbb2 More verboose Lanczos 2020-01-04 03:13:40 -05:00
Peter Boyle
039eb7b2eb Make the force term and coarsening multigrid more optimised 2020-01-04 03:12:17 -05:00
Peter Boyle
f7e4bd1f6d Getting more optimised 2020-01-04 03:11:53 -05:00
Peter Boyle
0afecfcae7 Nearing well optimised state 2020-01-04 03:11:19 -05:00
Peter Boyle
ba40a3f763 Alternate low pass filter option 2020-01-03 05:29:09 -05:00
Peter Boyle
aa920aa532 Improved DWF multigrid 2019-12-28 10:32:35 -05:00
Peter Boyle
c0d8e4dce5 Improved Multigrid for DWF 2019-12-28 10:32:15 -05:00
Michael Marshall
0ca1992151 Remove warning in tensor layout comparison. Make default names and index names visible for PerambTensor and NoiseTensor 2019-12-20 13:53:27 +00:00
Michael Marshall
df2b0c4e79 Merge branch 'develop' into feature/distil
* develop:
  Missing conjugate in MooeeInvDag
  Allow subspace setup to no converge
  fp16 mandatory. Use SFW is not available as hdw
2019-12-20 13:24:59 +00:00
Peter Boyle
9cfd64c604 Coarse grid on GPU, not fast enough yet. Need a 10x 2019-12-17 05:24:45 -05:00
Peter Boyle
e478404291 Tuned up significantly on GPU, but another 10x in coarse space required 2019-12-17 05:03:25 -05:00
Peter Boyle
9aafd20468 Simple block project promote runs faster on GPU 2019-12-17 05:01:39 -05:00
Peter Boyle
5d834486c9 Merge pull request #259 from grid-test-organisation/feature/5d-improvement-fix
Missing conjugate in MooeeInvDag
2019-12-16 04:20:37 -05:00
gfilaci
f7373e97a4 Missing conjugate in MooeeInvDag 2019-12-16 10:05:50 +01:00
Peter Boyle
9e15474999 Accelerator loop attempt at speed up 2019-12-14 05:28:16 -05:00
Peter Boyle
152b525a4d Typo fix 2019-12-13 22:44:42 -05:00
Peter Boyle
d18994eddc offload more of mgrid to GPU 2019-12-13 22:08:11 -05:00
Peter Boyle
b8bd8cd2ae Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-13 21:32:10 -05:00
Peter Boyle
736b19485e Faster set up and some dead code ifdef'ed out 2019-12-13 21:30:48 -05:00
Michael Marshall
c7637a84ad Documentation tweak for peculiarities of OpenMPI --prefix 2019-12-12 17:00:03 +00:00
Michael Marshall
a7772c827b Documentation tweak 2019-12-12 16:05:22 +00:00
8e83398861 Merge pull request #257 from AndrewYongZhenNing/develop
Added NamedTensor.hpp
2019-12-11 21:36:59 +00:00
David Murphy
843ca9350a Fix naming conventions to be consistent with Peter 2019-12-11 11:46:18 -05:00
f47b2b6e13 Added NamedTensor.hpp 2019-12-11 15:56:46 +00:00
Peter Boyle
5bfd1470ad Merge branch 'develop' into feature/hdcr 2019-12-10 21:51:06 -05:00
Peter Boyle
6957b0b58a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-10 21:50:42 -05:00
Peter Boyle
d73f0b8618 Verbose for temporary debug 2019-12-10 21:50:06 -05:00
Peter Boyle
0b3a3562c3 Some MPI (summit) create sigusr2, so trap that 2019-12-10 21:49:12 -05:00
Peter Boyle
710fee5d26 Subspace setup testing code
and timing verbose
2019-12-10 21:48:42 -05:00
Peter Boyle
bab0bf2e93 Merge branch 'develop' into feature/hdcr 2019-12-10 21:47:41 -05:00
Peter Boyle
848079e8ba Merge pull request #235 from grid-test-organisation/feature/5d-improvement
MooeeInv and M5D optimisations + enable threading with nvcc
2019-12-10 21:45:03 -05:00
Peter Boyle
f2a4f13111 Must offload the Coarsened matrix if Stencil buffers are device resident 2019-12-10 19:32:12 -05:00
David Murphy
4180a4a8a7 Import BiCGSTAB solvers and tests 2019-12-10 17:20:35 -05:00
b9b9fcbfa0 Merge pull request #229 from nils-asmussen/feature/JacobiSmear
MSource::jacobi smear + sort file contents of Modules.hpp and modules.inc
2019-12-09 22:50:02 +00:00
bbe48998a8 sort Modules.hpp and modules.inc + add module JacobiSmear 2019-12-09 18:06:29 +00:00
6446671a9c Merge pull request #241 from nils-asmussen/fix/remQCDns_ignore_ws
Undo whitespace changes in fix/removeQCDremnants to allow comparing relevant changes
2019-12-09 18:02:21 +00:00
110373ea79 Merge pull request #204 from nils-asmussen/sha256sum_Eigen_download
bootstrap.sh: verify checksum of Eigen tar file
2019-12-09 18:01:46 +00:00
a986786192 bootstrap.sh: verify checksum of Eigen tar file if sha256sum is installed 2019-12-09 17:11:21 +00:00
Peter Boyle
edd1c924eb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-09 03:53:01 -05:00
Peter Boyle
9b6b0caa55 Junk commit fix 2019-12-09 03:01:58 -05:00
Peter Boyle
2a48617ac5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-09 03:00:00 -05:00
Peter Boyle
876d9c957c QMR 2019-12-09 02:59:49 -05:00
Peter Boyle
295e535f93 QMR 2019-12-09 02:59:35 -05:00
Peter Boyle
58a31f0763 QMR implemented, preserve even if not used much 2019-12-09 02:59:13 -05:00
Peter Boyle
3d2fe80780 Temporary size depends on checkerboard/uncheckerboard. The Mdir cares 2019-12-09 02:58:24 -05:00
Peter Boyle
e43fce1083 Clean up and simplify a little. 2019-12-09 02:55:45 -05:00
Peter Boyle
0dfdf80407 Logging 2019-12-09 02:54:52 -05:00
Peter Boyle
2912071f83 Add non hermitian operator 2019-12-09 02:51:53 -05:00
Peter Boyle
26605ef387 HDCR back to working 2019-12-09 02:51:01 -05:00
1e5ac576d9 Merge commit 'f7698b93ca57ea3aa4d72b133ad9ca5d1e703661' into develop
# Conflicts:
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-12-06 11:59:21 +00:00
d5492b426f Hadrons: better order in module list 2019-12-06 11:56:26 +00:00
d428858c9d Merge pull request #255 from fionnoh/feature/sparseNoise
Feature/sparse noise
2019-12-06 11:43:27 +00:00
ferben
f7698b93ca corrected comments about quark line directions 2019-12-06 09:46:52 +00:00
ferben
7ce77690b8 Naming conventon also applied to metadata 2019-12-05 17:38:43 +00:00
ferben
164ed9c434 Naming conventon also applied to metadata 2019-12-05 17:38:00 +00:00
ferben
a54157e682 more definitions changed 2019-12-05 17:08:09 +00:00
ferben
58b6a0d8d1 changed some naming conditions to resemble rare-kaons 2019-12-05 16:56:54 +00:00
ferben
1a5e562bde only one FIMPL left! 2019-12-05 16:46:58 +00:00
Fionn O hOgain
45be26cf3f Merge branch 'develop' of https://github.com/fionnoh/Grid into feature/sparseNoise 2019-12-05 16:18:47 +00:00
Fionn O hOgain
5227ffccb7 Added James' sparse noise code and a module to use it 2019-12-05 15:50:03 +00:00
a0b47cc0be Merge pull request #254 from fionnoh/bugfix/eigenMigration
Updated Eigen URL after migration to gitlab
2019-12-05 15:26:38 +00:00
ferben
b766038810 new syntax after merge 2019-12-04 18:08:00 +00:00
ferben
cd9fd80a5d merged in develop 2019-12-04 17:12:46 +00:00
d6100cc35a Merge pull request #253 from mmphys/feature/distil
Fix phase convention adjustment error
2019-12-04 14:58:51 +00:00
Fionn O hOgain
29a1530510 Updated Eigen URL after migration to gitlab 2019-12-04 13:49:22 +00:00
Michael Marshall
15119eaf03 Fix phase convention adjustment error (and make no assumptions about node layout) 2019-12-04 09:59:58 +00:00
188e12ffbb Merge pull request #249 from mmphys/feature/distil
Feature/distil
2019-12-03 18:06:00 +00:00
ferben
e940f4db7e removed unused parameter parity 2019-12-03 12:01:31 +00:00
ferben
9c7f269489 typo in fimpl4 2019-12-03 11:19:54 +00:00
ferben
07feaf9531 updated ascii-doc preamble 2019-12-03 11:17:35 +00:00
Michael Marshall
7983ff2fdd Merge branch 'develop' into feature/distil
* develop:
  Change to reporting
  NVCC timer support
  Fix nocompilee under NVCC
  --enable-summit flag
  IBM summit optimisation. Synchronise in node is still btweeen 2 halves of AC922, so could be a little faster
  Sliced propagator contraction was not producing any results because buf.size()=0
  several typos in hadrons
2019-11-30 16:47:03 +00:00
Michael Marshall
2db814f2b7 Resolve conflicts in BaryonUtils (just use latest from develop) 2019-11-29 18:19:35 +00:00
Michael Marshall
6418f06771 Add option to save the eigenvectors of the Laplacian.
If they are saved, then metadata saved are:
solverXml	Parameters for this LapEvec module instance
OperatorXml	module type and parameters (if any) for the module that created the gauge field
2019-11-29 18:06:18 +00:00
8a5576f73c cleared up how exactly q_spec has to be defined 2019-11-28 12:35:18 +00:00
Peter Boyle
997790ad24 Allow subspace setup to no converge 2019-11-26 14:04:28 -05:00
Peter Boyle
900d6fad21 fp16 mandatory. Use SFW is not available as hdw 2019-11-26 13:26:43 -05:00
799ff0c96e speed-up 2019-11-26 15:28:47 +00:00
5fd5c25114 now two seperate functions for Eye and NonEye 2019-11-26 13:44:55 +00:00
62b3799c77 Merge pull request #251 from fionnoh/bugfix/WallWallMeson
MContraction::Meson bugfix
2019-11-26 12:46:03 +00:00
Peter Boyle
d1a89af8c9 Change to reporting 2019-11-22 10:49:10 -05:00
Peter Boyle
d91ba1f6cc NVCC timer support 2019-11-21 20:11:19 +00:00
Peter Boyle
f4d27e7090 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-11-21 20:09:31 +00:00
Peter Boyle
feb1ff3494 Fix nocompilee under NVCC 2019-11-21 20:03:39 +00:00
Peter Boyle
8ef6175acc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-11-21 15:02:21 -05:00
Peter Boyle
e4399e3ee1 --enable-summit flag 2019-11-21 15:02:10 -05:00
Peter Boyle
98ea67b636 IBM summit optimisation. Synchronise in node is still btweeen 2 halves of AC922, so could
be a little faster
2019-11-21 15:00:46 -05:00
ferben
421a4395af Sigma to Nucleon contractions 2019-11-21 17:25:37 +00:00
Fionn O hOgain
cf95a460a5 Sliced propagator contraction was not producing any results because buf.size()=0 2019-11-21 17:17:55 +00:00
a60e20f265 Merge pull request #250 from mmphys/hadrons-typos
several typos in hadrons
2019-11-20 17:10:08 +00:00
ferben
9261c0da89 several typos in hadrons 2019-11-20 17:06:32 +00:00
ferben
b350a24ded fixed test_distil 2019-11-18 15:29:20 +00:00
Michael Marshall
13a0db7162 Reverse changes not intended to be part of distillation release 2019-11-18 12:34:49 +00:00
Michael Marshall
18177d9709 Review changes 2019-11-18 11:59:13 +00:00
Michael Marshall
7bf42b9c0e HADRONS_ERROR 2019-11-18 10:27:35 +00:00
ferben
2d6f4e0c09 fixed issue with HADRONS_ERROR, no idea why this works 2019-11-15 13:46:47 +00:00
ferben
7f06c40107 _var -> var_ 2019-11-15 13:26:24 +00:00
ferben
9f75065205 eigen_strong_inline gone 2019-11-15 13:22:20 +00:00
ferben
271a02230e assert -> ERROR 2019-11-15 11:11:50 +00:00
ferben
b1e8b5b5ce changed default behaviour as discussed with antonin 2019-11-15 11:00:25 +00:00
ferben
25d2521d77 small stuff 2019-11-13 16:34:09 +00:00
ferben
500ef17143 beauty 2019-11-13 15:14:51 +00:00
ferben
ee9dd22643 worked on test_distil 2019-11-13 14:59:44 +00:00
ferben
a977d9901b cleanup 2019-11-13 14:52:06 +00:00
ferben
667ffb70db changed error type 2019-11-13 12:16:56 +00:00
ferben
65b3059bd7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-11-13 11:51:14 +00:00
ferben
5238808ccd No DistilVectors specified in xml no throws an error 2019-11-13 11:50:55 +00:00
Michael Marshall
8f88fee680 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  made notation DPar->dp consistent over modules
2019-11-13 11:34:10 +00:00
Michael Marshall
fcc412a1c2 Remove conditional compilation to support GPU build 2019-11-13 11:32:23 +00:00
ferben
12e415330f made notation DPar->dp consistent over modules 2019-11-13 11:21:08 +00:00
Michael Marshall
66e0811317 Attempt to fix cuda build 2019-11-13 00:02:51 +00:00
Michael Marshall
55e743aad6 Streamline 2019-11-12 23:57:28 +00:00
Michael Marshall
e2ab0d671e Implement destructors 2019-11-12 23:18:37 +00:00
Michael Marshall
7a4c5dbbd5 Restoring previous version for _reduced variables 2019-11-12 22:12:35 +00:00
Michael Marshall
3f00b8f6c7 Switch to std::unique_ptr<GridCartesian> grid3d;
Remove hand-coded reference to pi - switch to <math.h> definition
2019-11-12 21:53:09 +00:00
Michael Marshall
6d7043e0c2 NamedTensor changes done 2019-11-12 17:31:42 +00:00
ferben
b0f24ec302 Test works now 2019-11-12 15:14:13 +00:00
Michael Marshall
fb2834bf82 Oops 2019-11-12 14:01:20 +00:00
Michael Marshall
78f75b0e9f Better than graffiti 2019-11-12 14:00:46 +00:00
Michael Marshall
62dd0bfe58 New parameter module compiles. Untested. 2019-11-12 13:59:53 +00:00
ferben
db952993fa envCreate problem.. 2019-11-12 12:23:34 +00:00
ferben
b8f0878981 removed most default behaviour 2019-11-11 17:49:38 +00:00
ferben
df586a142d added DistilPar-module and cleaned up some code 2019-11-11 17:29:55 +00:00
ferben
7a446d5b7f removed default filenames 2019-11-11 14:36:45 +00:00
ferben
e7d7ea4f8f added LoadNoise module 2019-11-11 12:55:45 +00:00
Michael Marshall
f8e1941327 Implemented specialisations of NamedTensor as derived classes, however this suffers a number of problems:
1) virtual functions not available in base class constructor where I'd like to use them - e.g. IndexNames
2) Must define new constructors in derived classes
... so the specialisations are fatter than I'd like. Would prefer to revert to specifying tensor name and index name defaults in template
2019-11-08 11:55:00 +00:00
65aa54804e added comments 2019-11-08 11:15:51 +00:00
ferben
293bfe17d1 added code to the noise module... 2019-11-07 14:00:40 +00:00
ferben
a8f3a111a5 added Serial RNG - code compiles but not tested! 2019-11-07 13:45:38 +00:00
ferben
5c23abe507 commented on Notation 2019-11-07 11:57:40 +00:00
Michael Marshall
22c654182a Fixes for GPU compile 2019-11-04 17:24:34 +00:00
Michael Marshall
6f0439c0e4 Remove unnecessary cast 2019-11-04 15:50:14 +00:00
Michael Marshall
4f9a7c5d76 Back out unnecessary change 2019-11-02 16:50:29 +00:00
Michael Marshall
fcd90705bc Beautification 2019-11-02 16:15:48 +00:00
Michael Marshall
4bcdb4ff95 Remove accidental check-in of local debugging 2019-11-02 15:24:12 +00:00
Michael Marshall
1c10933db1 Rationalisation of NamedTensor (Perambulator) 2019-11-02 14:58:32 +00:00
Michael Marshall
52d8d576d0 Removed SliceShare as a reusable routine 2019-11-01 20:10:51 +00:00
Michael Marshall
ada0a7a83b C++11 case comparison of named tensor index names 2019-11-01 16:05:08 +00:00
Michael Marshall
efe2f2d48b Merge branch 'develop' into feature/distil
* develop:
  Summit jsrun GPU mapping updates. Conffigure with --enable-jsrun
  Fixed Lanczos calling aligned alloc in threaded region hitting up against pointer-cache no-threading restrictions Fixed Lattice::reset not compiling with new Grid explicit memory region handling Fixed memory leak in Lattice::resize that occurs when data region has been previously allocated
2019-11-01 15:38:48 +00:00
Michael Marshall
45d4cf0971 Cleanup in progress 2019-11-01 15:35:07 +00:00
Peter Boyle
ac614cbc53 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-10-31 11:46:43 -04:00
Peter Boyle
ec8e060ec7 Summit jsrun GPU mapping updates. Conffigure with --enable-jsrun 2019-10-31 11:46:09 -04:00
Felix Erben
5c54f27ac1 some cleanup, but hard-coded src in LapEvec unclear 2019-10-31 11:51:05 +00:00
Felix Erben
4ed9379535 some cleanup 2019-10-31 11:45:50 +00:00
Michael Marshall
858e348a6d Cleanup of messages 2019-10-31 11:11:52 +00:00
Michael Marshall
3b3680c64e Reversed Felix's interim A2Autils.h changes ... these were finished and went into develop via a separate branch 2019-10-30 15:50:04 +00:00
Michael Marshall
2a926b3dc6 Merged latest changes from develop, in preparation for release. 2019-10-30 14:52:34 +00:00
Chris K
845a045493 Merge pull request #233 from giltirn/lanczos_fix
A few run /compile / memory leak fixes
2019-10-30 10:21:59 -04:00
Michael Marshall
eb8848a071 Merge branch 'develop' into feature/distil
* develop: (27 commits)
  Update README.md
  result layout standardised, iterator size more elegant
  updated syntac in Test_hadrons_spectrum
  chroma-regression test now prints difference correctly
  baryon input strings are now pairs of pairs of gammas - still ugly!!
  second update to pull request
  Changing back interface for Gamma3pt
  Removing old debug code
  Changes to A2Autils
  suggested changes for 1st pull request implemented
  changed input parameters for easier use
  Should compile everywhere now
  changed baryon interface
  added author information
  ready for pull request
  code compiling now - still need to test
  Baryons module works in 1 of 3 cases - still need SlicedProp and Msource part!!
  thread_for caused the problems - slow for loop for now
  still bugfix
  weird bug...
  ...

# Conflicts:
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-10-30 14:13:00 +00:00
Peter Boyle
f31e3278a6 Update README.md 2019-10-25 11:43:55 -04:00
Michael Marshall
ca234325bc Fix single-precision error 2019-10-23 21:49:32 +01:00
c97f780784 Merge pull request #243 from fionnoh/feature/A2A_current_insertion
Feature/a2 a current insertion
2019-10-22 13:55:53 +01:00
Michael Marshall
78bdb0ff6a Grid 2019-10-20 14:22:45 +01:00
Michael Marshall
decab587a0 PerambFileName defaults to object name if empty 2019-10-20 14:14:06 +01:00
202f025fc7 Merge pull request #242 from mmphys/feature/baryons
Feature/baryons
2019-10-16 15:06:32 +01:00
Felix Erben
3c702b510b result layout standardised, iterator size more elegant 2019-10-15 18:48:51 +01:00
Michael Marshall
519ce19128 Fixes to enable GPU build. NB: Contractor and ContractorBenchmark still not working 2019-10-14 22:40:13 +01:00
Felix Erben
8d166a81c0 updated syntac in Test_hadrons_spectrum 2019-10-14 13:41:08 +01:00
Felix Erben
aa62ca9046 chroma-regression test now prints difference correctly 2019-10-10 11:07:20 +01:00
Felix Erben
2dee4791db baryon input strings are now pairs of pairs of gammas - still ugly!! 2019-10-09 17:56:09 +01:00
Felix Erben
548b3bf43c second update to pull request 2019-10-09 14:52:33 +01:00
Fionn O hOgain
a55d0ba8fe Changing back interface for Gamma3pt 2019-10-08 15:52:01 +01:00
Fionn O hOgain
5de9547db5 Removing old debug code 2019-10-08 15:51:28 +01:00
Fionn O hOgain
6a3b09cf02 Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion 2019-10-08 13:25:51 +01:00
Fionn O hOgain
10de4bfc23 Changes to A2Autils 2019-10-08 13:24:56 +01:00
Felix Erben
2ce7f2b4d8 suggested changes for 1st pull request implemented 2019-10-08 13:19:47 +01:00
Michael Marshall
88d6ff8f1d Peter's bugfix in ImplicitlyRestartedLanczos.h
My bugfix in MomentumPhase.hpp
2019-10-07 17:36:11 +01:00
Michael Marshall
803329af99 Merge branch 'develop' into feature/distil
* develop:
  Fix after GPU merge: Phase in Free Propagator
  z2-momentum phase module

# Conflicts:
#	Hadrons/Modules/MSource/MomentumPhase.hpp
2019-10-07 13:09:52 +01:00
Michael Marshall
9d96899aa8 Doc bugfix 2019-10-07 13:05:04 +01:00
Michael Marshall
86939dbf1a Removed unnecessary function (for getting a parameter) 2019-10-04 13:59:59 +01:00
317645aaeb undo (most) whitespace changes in the two files HMC/Mobius2p1fEOFA{,_F1}.cc 2019-10-02 16:25:23 +01:00
Felix Erben
e280ec6b0b changed input parameters for easier use 2019-10-02 16:14:06 +01:00
d5a180d914 Merge branch 'fix/removeQCDremnants' into fix/remQCDns_ignore_ws 2019-10-02 16:11:27 +01:00
d2928761dd Merge pull request #240 from guelpers/feature/bugfixafterGPUmerge
Fix after GPU merge: Phase in Free Propagator
2019-10-02 15:00:15 +01:00
f2a74c603f Merge pull request #239 from mmphys/z2_momentum
z2-momentum phase module
2019-10-02 14:57:59 +01:00
5f22810f55 Fix after GPU merge: Phase in Free Propagator 2019-10-02 14:49:35 +01:00
Michael Marshall
92e25488f8 Added MomentumPhase Hadrons module from z2_momentum branch (thankyou, Felix) so I can run Z_2 wall with momenta easily 2019-10-02 14:13:35 +01:00
Michael Marshall
89ef2b7dc2 Should compile everywhere now 2019-10-02 13:20:07 +01:00
Michael Marshall
7606554b76 Remove references to unused modules (now part of separate Baryons branch) 2019-10-02 13:16:58 +01:00
Felix Erben
c8fc0b3e0c changed baryon interface 2019-10-02 11:36:39 +01:00
Felix Erben
ccb5e8374b z2-momentum phase module 2019-09-30 17:36:15 +01:00
Felix Erben
b88fd436e7 added author information 2019-09-30 17:07:46 +01:00
Felix Erben
155bcd4ff3 ready for pull request 2019-09-30 16:58:20 +01:00
Fionn O hOgain
d1daab601a Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion
Peter's GPU branch changes merged with A2A CI code
2019-09-30 16:53:44 +01:00
Felix Erben
e5d7910fa7 code compiling now - still need to test 2019-09-30 13:55:26 +01:00
Felix Erben
94b9a9474c Baryons module works in 1 of 3 cases - still need SlicedProp and Msource part!! 2019-09-27 15:08:56 +01:00
Felix Erben
bf62ec163d thread_for caused the problems - slow for loop for now 2019-09-26 13:33:49 +01:00
Felix Erben
8415e23fc6 still bugfix 2019-09-26 11:09:09 +01:00
Felix Erben
76c93aa44e weird bug... 2019-09-17 14:36:26 +01:00
Michael Marshall
3137628222 BaryonUtils.h is now part of Baryons 2019-09-17 13:19:20 +01:00
Michael Marshall
ce965ee6bb Cleanup tests that are no longer required 2019-09-17 13:10:59 +01:00
Michael Marshall
911fbb0f36 Cleanup modules that are no longer required 2019-09-17 13:06:52 +01:00
Michael Marshall
eb293e9909 Restore Baryons modules per develop branch 2019-09-16 20:29:37 +01:00
Felix Erben
f548114ff6 bugfix 2019-09-16 17:55:58 +01:00
Felix Erben
dab8c01c3d added Baryon code 2019-09-16 17:20:54 +01:00
Michael Marshall
2f3dd0703d Ensure Distillation test (Test_distil) works 2019-09-16 17:00:46 +01:00
Michael Marshall
2e963d1a78 Fix location of Grid.h and remove reference to QCD namespace 2019-09-16 15:34:47 +01:00
Michael Marshall
bf52e7cc96 Latest BaryonUtils.h from Felix + my fixes 2019-09-13 18:11:10 +01:00
Michael Marshall
61d017d0a5 Merge GPU support (upstream/develop) into distillation branch.
This compiles and looks right ... but may need some testing

* develop: (762 commits)
  Tensor ambiguous fix
  Fix for GCC preprocessor/pragma handling bug
  Trips up NVCC for reasons I dont understand on summit
  Fix GCC complaint
  Zero() change
  Force a couple of things to compile on NVCC
  Remove debug code
  nvcc error suppress
  Merge develop
  Reduction finished and hopefully fixes CI regression fail on single precisoin and force
  Double precision variants for summation accuracy
  Update todo list
  Freeze the seed
  Fix compiling of MSource::Gauss for single precision
  Think the reduction is now sorted and cleaned up
  Fix force term
  Printing improvement
  GPU reduction fix and also exit backtrace option
  GPU friendly
  Simplify the comms benchmark
  ...

# Conflicts:
#	Grid/communicator/SharedMemoryMPI.cc
#	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
#	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
#	Grid/qcd/smearing/StoutSmearing.h
#	Hadrons/Modules.hpp
#	Hadrons/Utilities/Contractor.cc
#	Hadrons/modules.inc
#	tests/forces/Test_dwf_force_eofa.cc
#	tests/forces/Test_dwf_gpforce_eofa.cc
2019-09-13 13:30:00 +01:00
Michael Marshall
04a661cafe Remove unused modules BC2 and Baryon2 2019-09-10 14:49:24 +01:00
gfilaci
a7fa86dc29 MooeeInv improvement for DW EOFA + comments 2019-09-05 12:05:21 +01:00
gfilaci
0c1efa5235 pass OpenMP flag to host compiler 2019-09-03 12:12:25 +01:00
gfilaci
fdd9b14e82 speed up MooeeInvDag for DWF EOFA 2019-09-02 14:49:51 +01:00
gfilaci
e66669d300 fast MooeeInv for EOFA 2019-09-02 14:26:13 +01:00
gfilaci
0efaf3c4fa access M5D coeffs through pointers 2019-09-02 11:33:00 +01:00
gfilaci
3ef519aaa4 fast MooeeInv 2019-09-02 11:18:14 +01:00
Peter Boyle
b473405652 Tensor ambiguous fix 2019-08-29 09:36:41 -05:00
Christopher Kelly
114ebb7914 Fixed Lanczos calling aligned alloc in threaded region hitting up against pointer-cache no-threading restrictions
Fixed Lattice::reset not compiling with new Grid explicit memory region handling
Fixed memory leak in Lattice::resize that occurs when data region has been previously allocated
2019-08-26 16:47:44 -04:00
Peter Boyle
9b7a6d197f Fix for GCC preprocessor/pragma handling bug 2019-08-23 14:37:46 +01:00
Peter Boyle
59cd7f3b70 Trips up NVCC for reasons I dont understand on summit 2019-08-23 06:03:49 -04:00
Peter Boyle
28d6be2a4e Fix GCC complaint 2019-08-22 18:56:37 +01:00
6b6c5aa626 remove namespace QCD from directory tests 2019-08-20 15:35:36 +01:00
9210b0aa6e remove namespace QCD from directory HMC 2019-08-20 15:21:23 +01:00
ad01290545 remove remnants of the namespace QCD 2019-08-19 20:30:33 +01:00
Fionn O hOgain
25150eb2e0 3pt contraction now takes a list of gammas 2019-08-15 12:09:30 +01:00
Peter Boyle
95f66cc93c Merge branch 'feature/gpu-port' into develop 2019-08-15 02:19:31 +01:00
Fionn O hOgain
d566637cec Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion 2019-08-07 12:11:40 +01:00
Felix Erben
51bed48cd2 added selfcontract module 2019-08-05 17:46:42 +01:00
Felix Erben
b875edceab Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil
Conflicts:
	Grid/qcd/utils/BaryonUtils.h
	Hadrons/Modules/MContraction/Baryon2.hpp
2019-08-05 14:19:43 +01:00
Felix Erben
29df60c0cb some debugging stuff 2019-08-05 14:10:04 +01:00
Michael Marshall
8d97e2a02a Say which A2AMatrix is being loaded, and which contraction is being performed (m of n) 2019-08-02 19:23:18 +01:00
Michael Marshall
ed23f6be20 Remove blank line from log 2019-08-02 15:59:18 +01:00
Michael Marshall
cad76827b0 Be consistent about separator usage. Log start / stop / duration 2019-08-02 15:47:20 +01:00
Michael Marshall
310867d46a Additional option to specify the separator used between terms in correlator 2019-08-02 11:25:29 +01:00
Michael Marshall
e598178d94 TODO: Felix, please fix. I commented this out because of compiler errors 2019-08-01 20:51:51 +01:00
Michael Marshall
723457d467 Contractor updates ready for test on Tesseract:
1) Move definitions of serialisable objects into header for re-use by external programs/utilities
2) Add "-s" switch for "Simple" correlators, i.e. only include A2AMatrix info for the actual fields included in each contraction
2019-08-01 20:35:55 +01:00
Michael Marshall
6f40021842 Fixed compiler errors: TODO: Felix, please validate 2019-08-01 19:57:59 +01:00
Felix Erben
622d5eaa3e Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-07-30 13:47:22 +01:00
Felix Erben
e66d48c142 second way to compute baryons - qdp style 2019-07-30 13:46:59 +01:00
Michael Marshall
f5ad4f3de8 Added the ability to write a version of the validated XML file excluding any of the module IDs supplied in a separate exclude file 2019-07-26 19:46:55 +01:00
Michael Marshall
e7050a7aed Support gamma structure names that have trailing white space 2019-07-19 11:58:56 +01:00
Felix Erben
e138bc7204 debug output 2019-07-19 11:16:35 +01:00
Michael Marshall
6d4fb35d84 Ready for testing 2019-07-19 10:33:03 +01:00
Felix Erben
56cefadf9b gamma matrices as input 2019-07-18 17:46:43 +01:00
ferben
9d82855c5d bugfix in Baryonutils 2019-07-18 15:45:43 +01:00
ferben
97d61f2564 bugfix in Baryonutils 2019-07-18 14:57:10 +01:00
ferben
11a8668d19 bugfix in Baryonutils 2019-07-18 14:44:55 +01:00
ferben
cded7670d0 new utils for baryons 2019-07-18 14:29:04 +01:00
ferben
feb029fb66 new utils for baryons 2019-07-18 14:24:16 +01:00
Felix Erben
5a62ebe7b1 general baryons case added 2019-07-15 15:26:30 +01:00
Michael Marshall
fa747173d1 Debugging references were to l-values, so added const to stop errors 2019-07-14 11:08:00 +01:00
Michael Marshall
12afb0395f Debugging transposeSpin - seems just not to be implemented for Lattice<x> 2019-07-11 17:42:26 +01:00
Felix Erben
ec4aa978ab why cant I spinTranspose 2019-07-11 14:01:41 +01:00
Michael Marshall
7bc4a06f3f This is probably what you want ... 2019-07-10 12:29:33 +01:00
Michael Marshall
cd659525e1 You probably want to add this to the build. And you may need to do a bootstrap 2019-07-10 12:08:37 +01:00
Felix Erben
dc2240d2d8 why does sliceSum in Nucleon.hpp not work 2019-07-10 11:34:16 +01:00
Felix Erben
98cf20cf06 continued work on baryons 2019-07-09 17:42:36 +01:00
Felix Erben
cc3346073e continued work on baryons 2019-07-09 17:30:32 +01:00
Felix Erben
3848da7c50 added nucleon module (non-distillation) 2019-07-08 17:43:14 +01:00
Felix Erben
b7d0cf6751 buxfix in diquark sum / baryons 2019-07-04 22:06:37 +01:00
Felix Erben
2c1a077369 continued on baryons 2019-07-02 17:55:28 +01:00
Michael Marshall
ae3abbe53d Added the ability for Perambulator module to save unsmeared sinks through the addition of two optional parameters:
UnsmearedSinkFileName: If present, specifies the filename to write to
UnsmearedSinkMultiFile: defaults to true to write each sink vector to a different file, but can be set to 0 for a single file
2019-07-01 17:28:27 +01:00
Felix Erben
5fc0188205 started saving sinks 2019-07-01 14:51:59 +01:00
fionnoh
67690df3bd Changes nedded to have a current insertion on every second time slice - avoids unnecessary contractions 2019-06-28 15:18:28 +08:00
fionnoh
ce29b18dc9 New modules for loading in MFs as diskvectors and producing propagaotrs from 4 quark contractions 2019-06-27 13:46:06 +08:00
fionnoh
421a0a8a36 Changes to A2Autils, A2AMatirx and DiskVector code that is needed for Hadrons 4 quark contraction module 2019-06-27 13:45:20 +08:00
fionnoh
ac530636ca A2Aloop bugfix 2019-06-27 13:44:47 +08:00
Michael Marshall
2d940a598c Inserted four extra parameters just to make this test compile. Needs to be fixed properly 2019-06-19 10:37:50 +01:00
Michael Marshall
c28c5fc61b Inserted four extra parameters just to make this test compile. Needs to be fixed properly 2019-06-19 10:31:41 +01:00
Michael Marshall
015340d60c Elided superfluous copy on write 2019-06-19 09:37:03 +01:00
Michael Marshall
9a8a63467e BC2 now runs. setup() runs twice, which had resulted in doubling up of momenta. Also fixed initialisation of momentum phases. 2019-06-12 15:25:59 +01:00
Michael Marshall
fe72dc099b Upgrade to Mojave forced me to reinstall MacPorts. These are the ports I installed to get Grid working 2019-06-04 16:12:24 +01:00
Michael Marshall
54edb9906e Housekeeping. #include <Grid.h> ---> #include <Grid/Grid.h> 2019-06-03 15:20:46 +01:00
Michael Marshall
9ff459816f ReadBinary needs to do case insensitive name comparison (since I changed the default case of perambulator column names) 2019-06-01 13:50:27 +01:00
Michael Marshall
eb737daeb5 Merge branch 'develop' into feature/distil
* develop: (34 commits)
  Hadrons: EMLepton: Wall source
  Revert "cleaning up Kl2 contraction"
  cleaning up Kl2 contraction
  posibility to save/load schedules directly from the application parameters
  moving VERSION file to the empty ChangeLog one, this create compilation problems with #include <version> in recent versions of LLVM and case-insensitive FS (typically macOS)
  Added precision tuning to Hadrons parameterfile writing
  Kl2 QED cleanup
  Added ZFIMPL to SeqGamma
  Added ZFIMPL to SeqConserved module
  F1 ensemble running with 96%~ acceptance etc..
  Make detection of HPE 8600 automatic
  Added variables that were missing from wall source setup
  Exposed a coulomb/landau enum to the gauge fixing module
  Coulomb gauge added as an option
  More logging, timing, and 4d/5d logic for eigpack gauge transforms
  Added gauge transform option to eigpack IO
  Hadrons: Lepton Propagator for kl2, sign swap for antiperiodic boundary
  A2A Lepton-Meson Field contraction
  Verbose
  Iteratoin range fix
  ...
2019-05-31 18:20:43 +01:00
Felix Erben
8ce7ebdca3 fixed contraction issue 2019-05-17 10:52:55 +01:00
Felix Erben
435653490e fixed contraction issue 2019-05-17 10:50:15 +01:00
Michael Marshall
10a052d695 3 issues preventing compilation under clang. Marked these with FELIX_ISSUE and made minimal change to make compile (as fix not obvious) 2019-05-17 09:59:01 +01:00
Felix Erben
acd5a01b65 some work on baryons 2019-05-16 15:11:50 +01:00
Michael Marshall
ec7d96ce3b Merge branch 'develop' into feature/distil
* develop:
  Hadron WeakEye and A2ALoop bug fixes, and WWVVContraction bug fix
  DiskVector: fix of memory bug triggering segfault when the cache is accessed following a certain pattern
  MFermion::GaugeProp fix for 4d fields
2019-05-14 13:10:40 +01:00
Michael Marshall
c16916cc45 Multiple local slice fixes 2019-05-06 10:35:42 +01:00
Michael Marshall
a865caf0d2 Forgot a const in IndexName only version of NamedTensor constructor 2019-05-03 22:17:25 +01:00
Michael Marshall
9ae4d369f3 Use the definition of the Perambulator Index names given in Hadrons::MDistil 2019-05-03 22:00:50 +01:00
Michael Marshall
ec24a1f828 Fixed 2 bugs in LapEvec: 1) InsertLocalSlice 2) ensure convergence assertion stops entire machine 2019-05-03 16:03:56 +01:00
Michael Marshall
0efe63f6fa 3D smearing fix 2019-05-02 19:37:59 +01:00
Michael Marshall
b7ead6c16a Fixed bug: iff stout smearing disabled then gauge field uninitialised 2019-05-02 18:20:49 +01:00
Michael Marshall
62692b68b9 I'd forgotten that Intel '17 doesn't like auto var{value}; syntax 2019-05-01 20:45:16 +01:00
Michael Marshall
311c35a15c Looking for fixes for Intel '17 compiler errors. std::cout << complex number ? 2019-05-01 18:22:08 +01:00
Michael Marshall
a3fe57f430 NamedTensor writes to tag NamedTensor by default (not filename) - so still usable in case user renames file.
Also tweaked tensor index name checking (which is used to ensure tensor is correct type)
2019-05-01 18:11:37 +01:00
Michael Marshall
8dc0587621 Post Michael / Felix review. Ready for Peter / Antonin review 2019-05-01 13:04:51 +01:00
Michael Marshall
cfe5fa7a35 1) Don't write Laplacian eigenvectors to disk 2) Add a test that loads perambulators from disk 2019-05-01 09:50:23 +01:00
Michael Marshall
e72e26c899 Get rid of unnecessary multiFile options 2019-05-01 08:53:08 +01:00
Michael Marshall
334f29becb Fairly close to ready for release. Felix and I to review, then submit for release 2019-04-30 23:53:57 +01:00
Michael Marshall
e56ead55ef WIP 2019-04-30 14:41:48 +01:00
Michael Marshall
d74d443d1b Pre-release cleanup in progress 2019-04-29 22:18:29 +01:00
Michael Marshall
4203105104 Part-way through release tidy-up 2019-04-29 18:40:38 +01:00
Michael Marshall
ac19c0e04f This will need to be removed eventually, but should save us fiddling about with each release 2019-04-29 09:20:08 +01:00
Michael Marshall
b48ca8a6ef Merge branch 'develop' into feature/distil
* develop: (36 commits)
  Mobius 2+1f sign off.
  Integrator logging on by default
  RHMC for mobius
  HMC make file
  Update
  Simple check
  Simple checks
  Monius HMC
  Changes locally
  Power method
  Momentum rescaling
  Bounds checking
  Bounds checking
  Scale momentum convention to CPS/UKQCD MD time
  Add bounds checking
  Updated documentation after Peter's review. 1) Removed version numbers from Grid dependencies 2) Explained in a little more detail how to use Xcode to build Grid and Hadrons libraries
  Remove bundled Eigen stuff
  Fix typo so it matches develop
  Remove bundled source from my local repository
  Slightly generalize interface to SchurRedBlackBase and derived solver classes so we can pass forecasted initial guesses in EOFA heatbath correctly
  ...
2019-04-29 08:37:39 +01:00
Michael Marshall
c48ae4f3ad 1) Only the boss should write the perambulator - possibly was a source of intermittent corruption?
2) Implemented and test a perambulator conversion utility in Test_distil (commented out near the start of main)
2019-04-28 23:24:57 +01:00
Michael Marshall
fb74de0798 Making sure Hdf5 is an optional dependency (default to binary writer if not present) 2019-04-28 20:23:44 +01:00
Michael Marshall
adc1eaee68 Switched to Hdf5 format for perambulators. Ready for first test on Tesseract. 2019-04-28 17:53:42 +01:00
Michael Marshall
5aca4e8670 Just realised that the trace is at every lattice site, so moved the check for no smearing further up 2019-04-26 17:23:18 +01:00
Michael Marshall
e223d0b99f Need to validate range about which exp^iQ is considered unity 2019-04-26 16:00:35 +01:00
Michael Marshall
2e220456d3 First attempt at minimising smearing 2019-04-26 15:54:05 +01:00
Felix Erben
4333d97958 fixed parameter 2019-04-26 14:29:21 +01:00
Felix Erben
55c9c45d4b Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-04-26 14:28:01 +01:00
Felix Erben
e70e03f560 started stout smearing for small w 2019-04-26 14:27:40 +01:00
Michael Marshall
ff5e2e0f47 Debug output fix. Meant to print the rho matrix for stout smearing ... not the address of the function that creates it 2019-04-26 12:30:41 +01:00
Michael Marshall
4f3d1ea6e8 Two heads are better than one. Combined effort and hopefully spatial smearing now fixed! 2019-04-26 12:18:11 +01:00
Author Name
b1768ba820 Urgh! 2019-04-26 10:04:27 +01:00
Michael Marshall
3ac5a69a57 Ready to test spatial smearing (again) 2019-04-26 08:54:30 +01:00
Michael Marshall
50a74eaea3 Doesn't compile. Does it still need to be maintained? 2019-04-26 08:33:10 +01:00
Michael Marshall
8419fbb335 Renamed PerambLight module. Check with Felix whether Test_24 and Test_tesseract still need to be maintained 2019-04-26 08:23:15 +01:00
Michael Marshall
23a9b93cda More dependencies for Distil.hpp move and (C) 2019 only 2019-04-26 07:39:05 +01:00
Michael Marshall
ecdc3ddebf Moved Distil.hpp and added GNU license to all files 2019-04-26 07:24:56 +01:00
Michael Marshall
606698511c Seems we've not been keeping the test up-to-date 2019-04-22 19:03:24 +01:00
Michael Marshall
a97b814f0c Remove redundancy in LapEvec filename 2019-04-19 14:09:36 +01:00
Michael Marshall
7214681e11 Spatial smearing doesn't work yet. Fixed inconsistency in naming of perambulator in PerambLight.hpp 2019-04-19 13:54:25 +01:00
Michael Marshall
143b75956c Stout smearing 3D fixes. Changed LapEvec to perform spatial smearing only 2019-04-19 11:54:02 +01:00
Felix Erben
4a4203c610 fixed stout smearing for now 2019-04-18 19:10:49 +01:00
Felix Erben
2b598294c9 added distil source module 2019-04-18 17:47:09 +01:00
Michael Marshall
d111c70c38 Merge branch 'develop' into feature/distil
* develop:
  Make sure Grid::Serializable can write Eigen Tensors to output streams. NB: 1) The Eigen package defines operator<< for Eigen tensors, but this format is different, hence Grid::Serializable::WriteMember 2) For simplification, the contents are written in memory order. I.e. Different results will be obtained depending on whether the tensor is row- or column-major
  ... this time without the new Distillation modules ...
  Eigen tensor serialisation fixes after Antonin's review
  Iterator added. Will wait for review comments before finalising.
  Fix build with Intel '17 compiler, i.e. workaround incorrect auto types for c++ style definitions. E.g. assuming T::rank is an int, then objects defined like so:     const auto rank{T::rank}; should also be int. Unfortunately, Intel '17 instead defines them to be std::initializer_list<int>, then proceeds to complain where these variables are used that they cannot be converted to int. NB: This was fixed under Intel '18
  Pushed paboyle's changes: Updates for clang happy
  Merge paboyle's no compile in single precision Intel 2019 fix
  Eigen::Tensor serialisation. Tested on single and double precision builds
2019-04-10 13:14:24 +01:00
Michael Marshall
ed2427d5f7 Make sure Grid::Serializable can write Eigen Tensors to output streams. NB:
1) The Eigen package defines operator<< for Eigen tensors, but this format is different, hence Grid::Serializable::WriteMember
2) For simplification, the contents are written in memory order. I.e. Different results will be obtained depending on whether the tensor is row- or column-major
2019-04-06 15:37:53 +01:00
Michael Marshall
ea2f34de7b Updated documentation after Peter's review.
1) Removed version numbers from Grid dependencies
2) Explained in a little more detail how to use Xcode to build Grid and Hadrons libraries
2019-04-06 13:37:47 +01:00
Michael Marshall
63dc0fa7e9 Fixed memory leak ... without breaking semantics of prior code. Possibly should change the semantics? For Peter / Antonin to comment 2019-04-04 16:00:17 +01:00
Felix Erben
5e6104e683 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-04-04 12:13:35 +01:00
Felix Erben
25e4ee3a49 3D Stout smearing added 2019-04-04 12:13:16 +01:00
Michael Marshall
4161429dcc Serialisation fixes after Antonin's review 2019-04-03 22:30:07 +01:00
Michael Marshall
b5eb97206b Merge branch 'develop' into feature/distil
* develop:
  MGauge::GaugeFix use standard convention for fields
  fix bug: MGauge::GaugeFix should not modify its input
  add gauge transformation matrix as output to module MGauge/GaugeFix
2019-04-03 16:24:49 +01:00
Michael Marshall
0da906cf66 Merge branch 'develop' into feature/distil
* develop:
  Documentation for using Grid with Xcode on Mac OS
2019-03-27 23:08:29 +00:00
Michael Marshall
3decb5f886 Merge branch 'develop' of github.com:paboyle/Grid into feature/distil
* 'develop' of github.com:paboyle/Grid:
  endianness fix in resilient IO
2019-03-27 20:39:23 +00:00
Michael Marshall
faa8bb9bc6 Fixed funny memory leak 2019-03-27 17:55:52 +00:00
Michael Marshall
4c02ed6d0c Updated GridXcode documentation 2019-03-27 13:54:39 +00:00
ferben
f757b80e1c tried to fix mem leak 2019-03-27 12:00:36 +00:00
ferben
b8581be1da : 2019-03-27 11:59:06 +00:00
Michael Marshall
9fce1263be Fixed bug in LapEvec if machine running spread-out in time 2019-03-26 13:24:39 +00:00
Michael Marshall
ae565b006a Compiling in single-precision now works 2019-03-25 22:56:01 +00:00
Michael Marshall
8502660023 Begin fixes for single precision 2019-03-25 20:40:05 +00:00
Michael Marshall
625a97a466 cosmetic 2019-03-25 18:16:04 +00:00
Felix Erben
bce2766fef Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-25 16:38:42 +00:00
Felix Erben
ce501afec6 bugfix 2019-03-25 16:38:25 +00:00
Michael Marshall
1d10a3b3de Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  bugfix
2019-03-25 15:50:57 +00:00
Michael Marshall
d1e02f50ff Added iterator for Eigen tensors 2019-03-25 15:50:29 +00:00
Felix Erben
48b03c4590 bugfix 2019-03-25 15:45:35 +00:00
ferben
b3b9e608e1 added new module for noises 2019-03-25 14:13:03 +00:00
Michael Marshall
4e87cbd400 Fix build with Intel '17 compiler, i.e. workaround incorrect auto types for c++ style definitions.
E.g. assuming T::rank is an int, then objects defined like so:
const auto rank{T::rank};
should also be int. Unfortunately, Intel '17 instead defines them to be std::initializer_list<int>, then proceeds to complain where these variables are used that they cannot be converted to int. NB: This was fixed under Intel '18
2019-03-23 09:28:41 +00:00
ferben
4fc045b563 added module to load perambulators from disk 2019-03-22 13:50:47 +00:00
ferben
fbf286b0e3 added Spin dilution 2019-03-22 13:30:11 +00:00
Michael Marshall
9dc3fe9922 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  modules list
2019-03-22 13:00:06 +00:00
Felix Erben
6c9029fab7 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-22 12:41:56 +00:00
Felix Erben
8700dd4d0d modules list 2019-03-22 12:41:53 +00:00
Michael Marshall
9c16391e55 Merge branch 'develop' into feature/distil
* develop:
  Updates for clang happy
2019-03-22 12:08:50 +00:00
Michael Marshall
685d9bafef Merge branch 'develop' into feature/distil
* develop:
  No compile in single precisoin Intel 2019 fix
2019-03-21 16:36:48 +00:00
Michael Marshall
d2d26b302d Removed the module we don't need from modules.inc (so make now works)
i.e. removed Modules/MDistil/PerambMultipleSolves.hpp from Hadrons/modules.inc
2019-03-20 22:59:20 +00:00
Michael Marshall
88cb004731 Fixed single-precision issues in Test_serialisation 2019-03-20 22:05:16 +00:00
ferben
a66bb8acba fixed possible memory leak 2019-03-20 14:41:36 +00:00
ferben
4ae35000a9 removed module which we do not need 2019-03-20 13:36:57 +00:00
Michael Marshall
02b96b4602 Fixed module list (messed up when I merged from develop) 2019-03-20 11:20:40 +00:00
Michael Marshall
11dded61e8 Merge branch 'develop' into feature/distil
* develop: (29 commits)
  precision fix
  Updates after review with Peter.
  Wilson clover multi grid for lime lattice
  Recommendations for Traits classes
  Hadrons: uninitialised pointer fix (might have been harmless)
  Hadrons: beware of the nasty uninitialised twists
  Smearing test. Test on free field.
  Smearing for quark observables
  Smearing
  Hadrons: XML validator utility
  display relative norm during field IO norm check
  possibility to set a build number
  IO norm check on relative norm
  Output field norm check during IO
  Hadrons: random vector utility module I/O
  quieter initialisation
  fix patch command for eigen in bootstrap.sh
  Mres changes and gauge xform mat changes
  Hadrons: 32 bit I/O directly in Lanczos module
  Hadrons: copyright update
  ...

# Conflicts:
#	Grid/tensors/Tensor_traits.h
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-03-20 10:35:36 +00:00
Michael Marshall
24cf3b9df5 Ignore Version.h as it's created by automake/autoconf 2019-03-19 12:12:39 +00:00
Michael Marshall
9c8aa2047d Put GridXcode doc in subdirectory 2019-03-19 07:33:19 +00:00
Michael Marshall
204cfa1c5a Added documentation for Grid using Xcode 2019-03-19 07:28:29 +00:00
Michael Marshall
fe6845d38b Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-18 14:44:18 +00:00
bff4eeec41 Added disclaimer on half-precision types 2019-03-18 12:15:25 +00:00
Felix Erben
d1fe4dce33 new idea to get multiple perambulators 2019-03-15 10:28:02 +00:00
ferben
50ca3101de bug in multiSolves and new test prog 2019-03-13 17:25:55 +00:00
ferben
0faf40e207 last commit did not compile - fxied this 2019-03-13 13:24:18 +00:00
ferben
5313e44d11 some cleanup 2019-03-13 13:15:12 +00:00
ferben
6bb9b67c93 externalised gauge field reading to hadrons module 2019-03-13 12:09:12 +00:00
a0405c6d84 PerambMultipleSolves.hpp compiles (not had time to test) 2019-03-12 14:01:29 +00:00
ferben
c2a3231cdf added testing module for multiple perambulators 2019-03-11 18:05:39 +00:00
ferben
5fb2ee89bb modified test so that it runs 2019-03-08 16:50:21 +00:00
ferben
608a98d870 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-08 16:28:34 +00:00
ferben
2df396380d solver is now external 2019-03-08 16:28:21 +00:00
Felix Erben
64ba664637 changed debug options 2019-03-08 12:25:00 +00:00
ferben
4a70b2ffd4 Aslash insertions work now? 2019-03-08 12:23:22 +00:00
2d659015ff Serialisation is fully functional. Ready for review. 2019-03-08 00:30:43 +00:00
e63019ac50 Tensor serialisation is fully functional 2019-03-08 00:01:45 +00:00
Felix Erben
dde118fed9 added everythong to compute sequential aslash fields 2019-03-07 17:36:53 +00:00
Felix Erben
1538bf8c34 added everythong to compute sequential aslash fields 2019-03-07 17:36:22 +00:00
Felix Erben
4abc498ae3 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-07 15:34:10 +00:00
Felix Erben
93dfbfbfcd added module to compute perambulator from a solve 2019-03-07 15:33:50 +00:00
f9e273d4bf Making sure same as Traits-recommend 2019-03-07 14:33:04 +00:00
584fa0a633 Changes after review with Peter 2019-03-07 12:53:34 +00:00
ferben
73cdca3973 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-06 13:55:51 +00:00
ferben
d716f8a0c9 new module for baryon contraction 2019-03-06 13:55:36 +00:00
aa24f04911 Changed EigenIO to use GridTypeMapper type traits 2019-03-06 12:55:05 +00:00
1880e6d12d Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-06 11:16:34 +00:00
4a00513e65 Moving Eigen trensor utilities to separate (optional) header 2019-03-06 11:16:22 +00:00
ferben
7718ee199a efficient baryon test program 2019-03-05 17:16:42 +00:00
ferben
d7c7bff065 added output for source meson fields on all tsrc 2019-03-05 12:01:55 +00:00
ferben
802675f062 baryons should compile now... 2019-03-04 17:31:21 +00:00
d56d8c923f Replaced an error in A2AUtils.h that was stopping the build with an assert() 2019-03-02 00:36:53 +00:00
00c3c6fc54 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-02 00:24:47 +00:00
b3d4ba8657 Fixed issues with Eigen Tensor serialisation. Fixed issues with precision to text streams 2019-03-02 00:24:37 +00:00
Felix Erben
a4d578bd5d baryons work now??? 2019-03-01 14:44:39 +00:00
Felix Erben
7653649389 baryons working now 2019-03-01 12:57:41 +00:00
a344a2227e Fixing build errors 2019-02-28 20:30:16 +00:00
4b9200b35c Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-28 19:06:36 +00:00
91be028507 Still one issue on write 2019-02-28 19:06:25 +00:00
3b05f91f5c Prototype for template traits recommendations 2019-02-28 19:04:44 +00:00
Felix Erben
8804271339 efficient baryons compile! 2019-02-28 16:32:40 +00:00
6d9f377913 added parity 2019-02-28 11:05:31 +00:00
18b603c5ae simple but hopefully efficient baryon field 2019-02-28 10:27:05 +00:00
ferben
e9784572af baryons... 2019-02-27 17:51:25 +00:00
Felix Erben
f168a9e7ee continued with baryons 2019-02-26 16:41:52 +00:00
Felix Erben
50b6db75da Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-02-26 15:57:09 +00:00
Felix Erben
df065f1d57 first test configs 2019-02-26 15:57:01 +00:00
578eb177e7 Tweaked format and memory use on Xml format. Still crashes (out of memory) on large read on my laptop 2019-02-25 22:03:21 +00:00
81b3f3d2ca Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 15:39:07 +00:00
7c7ffa3b10 Added text read/write 2019-02-25 15:38:47 +00:00
ferben
1f098ceecf Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 15:36:23 +00:00
ferben
c47c1a2472 started working on baryons - this time efficiently 2019-02-25 15:36:11 +00:00
ec45b16840 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 14:10:34 +00:00
9288019789 Added Xml IO (has one deficiency: the format for multi-dimensional data is flat) 2019-02-25 14:10:24 +00:00
ferben
9c04139362 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 12:40:44 +00:00
ferben
cfc14a7432 more adjustments to test 2019-02-25 12:40:32 +00:00
31e40c26fa Oops. Forgot to delete SortNode (prevented linking) 2019-02-25 11:35:33 +00:00
ferben
3f2fe5c7e7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 11:18:34 +00:00
ferben
76b6e8a01e first tesseract test 2019-02-25 11:18:25 +00:00
f9543982e4 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 11:07:43 +00:00
3c9f2d4106 Chunking layout reasonably efficient. Looks for small prime factors of each dimension, falling back to approximate size if needed. 2019-02-25 11:07:29 +00:00
ferben
cad26a736e quick&dirty fix for g5*field 2019-02-22 17:05:16 +00:00
ferben
4f2ac433f1 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-22 16:31:26 +00:00
ferben
f9e505108b test Aslash 2019-02-22 16:31:17 +00:00
Felix Erben
d2aced13da Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-02-22 16:30:40 +00:00
Felix Erben
03d031d623 tesserct test 2019-02-22 16:30:22 +00:00
44a2d4854a Ensured Hdf5 chunk size always less than 4GB 2019-02-22 15:14:32 +00:00
292ff33f7f Removed issue with std::string_literal 2019-02-21 16:51:05 +00:00
55886cf9db ran make_module_list.sh 2019-02-21 16:14:13 +00:00
c640923159 Fixed reference to depth from test 2019-02-21 15:48:52 +00:00
752530f352 Gotten rid of c++17 in Test_serialisation.cc 2019-02-21 14:43:07 +00:00
34b9450fc9 Gotten rid of c++17 2019-02-21 14:22:48 +00:00
ferben
5d6462b706 bugfix 2019-02-21 11:13:10 +00:00
f70c5b004a some cleanup in Baryon2pt 2019-02-20 12:56:13 +00:00
5bb9de9242 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-19 17:37:37 +00:00
982a24514b Binary IO also implemented and tested 2019-02-19 17:37:21 +00:00
ferben
97c6f770b4 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-19 17:22:41 +00:00
ferben
4522f1e831 separated final 2pt Contraction 2019-02-19 17:22:30 +00:00
c14547ddbe EigenIO writing rationalised. All indices (trivial or not) written 2019-02-19 16:12:55 +00:00
63c97db414 Prior to rationalising 2 versions of BaseIO::write (scalar and vector) 2019-02-19 13:29:08 +00:00
6ebb32ffbf Rationalised Test_serialisation 2019-02-18 21:40:53 +00:00
07c97cb424 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-18 17:12:36 +00:00
04b58de5de Read-back working. 2019-02-18 17:12:27 +00:00
ferben
6e822b7201 added sign for contraction sum 2019-02-18 15:21:13 +00:00
ferben
625ccfcd72 continued baryon contraction code 2019-02-18 13:10:34 +00:00
c77069244d Nearly ready. Just finishing off readback and compare 2019-02-18 08:55:50 +00:00
9815ddb853 Started read routines. Introduced readMultiDim and tested I didnt break anything 2019-02-16 19:30:33 +00:00
74a3a5b825 Fixed existing bug in Hdf5Reader::readDefault for std::vector<U> 2019-02-16 18:45:46 +00:00
00e9416e0a Tweak to initialisation example 2019-02-16 17:08:22 +00:00
b6803a070a Making sure I understand row-major vs column-major ordering 2019-02-16 16:18:28 +00:00
ferben
bfd2770657 started on baryon flavour sums 2019-02-15 15:51:46 +00:00
ferben
668b1e77c7 small changes 2019-02-15 15:31:53 +00:00
ferben
e51744260f Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 14:32:26 +00:00
ferben
e0987d7d81 first contraction version done 2019-02-15 14:32:17 +00:00
26b94d7bda Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 13:53:00 +00:00
df0c8b5d84 Test of Eigen slices 2019-02-15 13:52:49 +00:00
ferben
a111d814db Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 10:47:41 +00:00
e8bd8767c0 Get rid of declarations inside constexpr functions. if constexpr warning remains 2019-02-15 10:06:15 +00:00
8cb96cb693 Hmmm lots of warnings depending on compiler ... 2019-02-14 19:17:12 +00:00
b9bee45277 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-14 19:05:47 +00:00
bee24655cd Finalising traits 2019-02-14 19:05:35 +00:00
886c895f81 baryon field structure is now eigentensor - started on contractions for 2pt functions 2019-02-14 16:44:54 +00:00
59c8cc1588 Minor bugfix 2019-02-13 22:11:24 +00:00
11467a994d Enough for tonight 2019-02-13 21:48:35 +00:00
ferben
9f2ca98dfc enseble can now be specified in LapEvec 2019-02-13 13:54:31 +00:00
bf434b6bef Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-13 12:14:18 +00:00
41ff592515 Moved serialisation tests into Test_serialisation 2019-02-13 12:14:01 +00:00
ferben
48ec937c55 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-13 11:48:57 +00:00
ferben
65731546b7 merge... 2019-02-13 11:48:34 +00:00
76c6a6772a Added rank_non_trivial 2019-02-12 22:15:55 +00:00
e7048231bc Working version with additional Grid traits pre: review by Antonin 2019-02-12 13:59:48 +00:00
49babeab19 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-11 23:26:46 +00:00
fb2cb3015e Writing of Eigen::Tensor of grid objects now works (for Hdf5) 2019-02-11 23:26:18 +00:00
ferben
53f45d2c7e Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-11 17:39:55 +00:00
ferben
d889cebc60 unique string is now used 2019-02-11 17:39:42 +00:00
9a225235b6 Can write both fixed and dynamic sized tensors (small tidy) 2019-02-11 17:15:38 +00:00
dff7d9261d Can write both fixed and dynamic sized tensors 2019-02-11 15:47:40 +00:00
6f2663edf6 Serialisation of an object containing an Eigen::Tensor works for Hdf5. Still quite a lot of tidying up to do. 2019-02-10 23:19:20 +00:00
d5024bd07e Hdf5 writing of scalar (i.e. no Grid subtypes) Eigen::Tensor works. But issues when adding Eigen::Tensor to serialisable object. 2019-02-10 15:33:16 +00:00
9c4189484a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-09 17:12:43 +00:00
3720103f41 Adding Eigen::Tensor still WIP 2019-02-09 17:12:36 +00:00
ferben
c4d27ee30f added parity operator to baryon fields 2019-02-08 15:49:52 +00:00
ferben
d26a5dce12 bugfix 2019-02-08 14:37:09 +00:00
ferben
5843a943d9 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-08 14:36:08 +00:00
ferben
c1341b8ed2 bugfix 2019-02-08 14:33:06 +00:00
6a4515d0cd baryons have now the correct (?) structure - also easier! 2019-02-07 12:27:57 +00:00
a0a39e4b00 Fixed initialisation of vector of Complex 2019-02-06 21:56:44 +00:00
b9fb16077c Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 21:37:54 +00:00
4b3c566c89 ../tests/hadrons/Test_hadrons_distil.cc 2019-02-06 21:36:46 +00:00
ferben
cbd2dfe53f Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 12:56:49 +00:00
ferben
6cdb1eb62c BContraction now computes what might be a baryon function, but probably isn't 2019-02-06 12:23:52 +00:00
ed7175076b Turned off warning of unused variable line 150 2019-02-06 09:32:13 +00:00
27677b3870 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 09:25:39 +00:00
7423f5af1a Examples of how to access Grid Tensors 2019-02-06 09:25:24 +00:00
ferben
21d6dbe0b6 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-05 17:32:39 +00:00
ferben
1ee84509b5 added baryons project - not working yet 2019-02-05 17:32:26 +00:00
57e57d162f Removed Eigen::DontAlign attribute 2019-02-05 12:50:28 +00:00
5b0870bb19 Added Scalar_ length and Scalar_Unit_Size to Perambulator file for validation 2019-02-05 09:07:05 +00:00
7f5354630a Updated perambulator binary format to save payload in big endian format on disk 2019-02-04 23:07:59 +00:00
008ac6b5ae Permabulator is read back from disk if it exists instead of being created 2019-02-04 12:06:32 +00:00
c7aa4e0c1f Perambulator filename can be specified in xml. NB: Perambulator binary format now includes data size in bytes to avoid type mismatches. 2019-02-04 11:30:30 +00:00
43bd918a47 Logging tweak 2019-02-03 21:48:50 +00:00
7eda54bb87 Only write indices with dimesion!=1 2019-02-03 20:58:58 +00:00
bd75b843fa Added checksum to data 2019-02-03 20:31:42 +00:00
8865bf5d7c Implemented perambulator read/write ... but in binary format. Will switch to Hdf5 when I have Antonins feedback 2019-02-03 17:05:19 +00:00
ferben
caabbcd951 minor change 2019-02-01 17:50:18 +00:00
48528c5b1d Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
Added index names to Perambulator
2019-02-01 15:31:27 +00:00
f7b90a0c14 Added index names to perambulator 2019-02-01 15:20:35 +00:00
ferben
a9848becb0 unsmeared sinks can now be computed - new test program available 2019-02-01 13:23:42 +00:00
7cc13f48d5 added some TODO comments; needs discussion 2019-01-31 16:54:11 +00:00
b6b267fd4b Fixed new test parameters 2019-01-31 15:11:12 +00:00
9671a61bb2 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-31 15:07:45 +00:00
d7dc617746 Switched perambulator to sue Eigen::Tensor (file write temporarily excluded) 2019-01-31 15:06:52 +00:00
32cb2e1a9a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-31 13:01:31 +00:00
3d31113337 added test t5 to compute meson fields of different quarks. Different nvec are allowed. 2019-01-31 13:01:16 +00:00
48b6f7e6ad Changed PerambLight<FIMPL> to PerambLight<GIMPL> 2019-01-31 12:37:00 +00:00
0da411fe60 LapEvec fixes 2019-01-31 12:28:38 +00:00
d7b9ed199d PerambLight fixes 2019-01-31 12:24:32 +00:00
7e74f7bec4 tsrc != 0 now works 2019-01-31 11:35:05 +00:00
dae7b30b92 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-30 21:16:30 +00:00
f7e4661ca0 Fixed grid3d leak in PerambLight 2019-01-30 21:16:09 +00:00
ferben
7b66197534 meson fields are now the same 2019-01-30 18:03:34 +00:00
ferben
c3273eff20 agreement up to laph vectors 2019-01-30 11:20:22 +00:00
ferben
67a3d7aeed added debug output, perambulators now agree up to 8 digits 2019-01-29 16:24:59 +00:00
ferben
d8831fe925 changed parameters to match Test_Distil 2019-01-29 13:40:26 +00:00
c7ceff6a21 Switched to Gauge field (GIMPL) 2019-01-28 12:28:35 +00:00
ferben
5580b3a7d1 bugfix in DistilVectors 2019-01-28 12:24:47 +00:00
33d8fb2dd9 Default 2019-01-25 19:21:12 +00:00
9f6f776460 ensured there is a default test to run 2019-01-25 19:14:22 +00:00
ferben
84fe36d084 meson functions work until to be saved 2019-01-25 17:26:43 +00:00
ferben
3438dde8df test prog now computes everything up to meson fields 2019-01-25 15:19:18 +00:00
ferben
aea49bc349 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-25 13:44:30 +00:00
ferben
9ef6f9878e test works up to perambulators now 2019-01-25 13:44:19 +00:00
708ca8585a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-25 13:26:56 +00:00
d15bf4b8e1 Added trajectory number to output file 2019-01-25 13:26:48 +00:00
ferben
7496da0987 bugfix in prambLight 2019-01-25 13:08:56 +00:00
ferben
2568f5b925 bugfix in prambLight 2019-01-25 12:37:18 +00:00
577cdf1d72 Simplified tests 2019-01-24 18:50:18 +00:00
f92ed659a7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-24 16:30:28 +00:00
dfb7fb1d9f LapEvec test works on --grid 4.4.4.8 2019-01-24 16:30:13 +00:00
a4c1ab6147 all modules linked in test prog 2019-01-24 16:12:19 +00:00
cf85f0388d Still debugging eigenvector parameters 2019-01-24 13:26:05 +00:00
00b0f75b0d Eigenvectors created. Still need to correctly set parameters for test. 2019-01-24 12:44:06 +00:00
b45586e81c Discovered bug root cause. setup() is called multiple times. Now ready to copy-paste the LapEvec code 2019-01-23 21:17:56 +00:00
2c7e6bf58b Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 15:20:06 +00:00
7c5a06f6d0 Trying to work out why LapEvec constructor not being called 2019-01-23 15:19:51 +00:00
ferben
068ef85b05 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 15:08:24 +00:00
ferben
a6ab742fdb added perambs to test 2019-01-23 13:58:20 +00:00
2062a8d578 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 13:00:20 +00:00
3d3e8f4f9f Structured objects passed into LapEvec 2019-01-23 12:59:55 +00:00
ferben
2756f16a5e created test prog for perambs 2019-01-23 12:49:20 +00:00
ferben
d7908c33de moved hard-coded parameters in DistilVectors to module input 2019-01-23 11:32:53 +00:00
ferben
4cc2ebc9e4 moved hard-coded parameters to module input 2019-01-23 11:26:07 +00:00
ferben
b8afa7314c Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-23 10:51:23 +00:00
ferben
be5605931c merge 2019-01-23 10:51:09 +00:00
09fa821510 Added remaining methods to Permabulator 2019-01-22 17:59:55 +00:00
ferben
f45d2d5dcc perambLight done, but SliceShare and Write does not work yet 2019-01-22 15:52:26 +00:00
ferben
0a82fae45c moved perambulator definition to shared header file 2019-01-22 15:06:45 +00:00
ferben
46b05aa9c5 cleaned up, deleted commented out old code 2019-01-22 13:48:44 +00:00
ferben
813c1ab1f1 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-22 13:28:09 +00:00
ferben
b1c27a141d DistilVectors complete and compiling - not tested at all! 2019-01-22 13:27:51 +00:00
81bb361299 Test program ready 2019-01-22 13:19:39 +00:00
ferben
79d533550d continued on DistilVectors.hpp 2019-01-21 16:45:31 +00:00
ferben
b8c106f320 working on DistilVectors, initialisation done and compiles 2019-01-21 16:04:18 +00:00
b74492a805 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-21 10:40:01 +00:00
c93a43f158 Added test program 2019-01-21 10:39:28 +00:00
Felix Erben
0ff410ae19 copied perambulato code into PerambLight.hpp 2019-01-18 17:47:41 +00:00
Felix Erben
ced30b61e2 added phi vectors - still commented out and does not compile otherwise 2019-01-18 16:38:13 +00:00
Felix Erben
2b782df290 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-18 15:58:51 +00:00
Felix Erben
f0f1ba0307 uses evec4d now 2019-01-18 15:58:10 +00:00
2343e621e6 Bananas 2019-01-18 13:32:27 +00:00
Felix Erben
2568504821 small change 2019-01-18 13:23:03 +00:00
b821dde020 Initial version 2019-01-18 13:14:28 +00:00
ae3b053334 Initial version 2019-01-18 13:10:02 +00:00
876 changed files with 51317 additions and 40895 deletions

1
.gitignore vendored
View File

@@ -88,6 +88,7 @@ Thumbs.db
# build directory # # build directory #
################### ###################
build*/* build*/*
Documentation/_build
# IDE related files # # IDE related files #
##################### #####################

View File

@@ -1,61 +0,0 @@
language: cpp
cache:
directories:
- clang
matrix:
include:
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
before_install:
- export GRIDDIR=`pwd`
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which autoconf
- autoconf --version
- which automake
- automake --version
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
script:
- ./bootstrap.sh
- mkdir build
- cd build
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check

View File

@@ -34,10 +34,15 @@ directory
#if defined __GNUC__ && __GNUC__>=6 #if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes" #pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
#if defined __GNUC__
#pragma GCC diagnostic ignored "-Wpsabi"
#endif #endif
//disables and intel compiler specific warning (in json.hpp) //disables and intel compiler specific warning (in json.hpp)
#ifdef __ICC
#pragma warning disable 488 #pragma warning disable 488
#endif
#ifdef __NVCC__ #ifdef __NVCC__
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp

View File

@@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h> #include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h> #include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/threads/Threads.h> #include <Grid/threads/ThreadReduction.h>
#include <Grid/serialisation/Serialisation.h> #include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h> #include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h> #include <Grid/communicator/Communicator.h>

View File

@@ -36,6 +36,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h> #include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h> #include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/utils/Utils.h> #include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h> #include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore); NAMESPACE_CHECK(GridQCDCore);

View File

@@ -6,6 +6,7 @@
/////////////////// ///////////////////
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <memory>
#include <vector> #include <vector>
#include <array> #include <array>
#include <string> #include <string>
@@ -27,4 +28,7 @@
/////////////////// ///////////////////
#include "Config.h" #include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */ #endif /* GRID_STD_H */

View File

@@ -18,12 +18,29 @@
#pragma push_macro("__CUDA_ARCH__") #pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__") #pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__") #pragma push_macro("__CUDACC__")
#undef __CUDA_ARCH__
#undef __NVCC__ #undef __NVCC__
#undef __CUDACC__ #undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__ #define __NVCC__REDEFINE__
#endif #endif
/* SYCL save and restore compile environment*/
#ifdef GRID_SYCL
#pragma push
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif
/* HIP save and restore compile environment*/
#ifdef GRID_HIP
#pragma push
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#endif
#define EIGEN_NO_HIP
#include <Grid/Eigen/Dense> #include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor> #include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -35,7 +52,20 @@
#pragma pop #pragma pop
#endif #endif
/*SYCL restore*/
#ifdef __SYCL__REDEFINE__
#pragma pop_macro("__SYCL_DEVICE_ONLY__")
#pragma pop
#endif
/*HIP restore*/
#ifdef __HIP__REDEFINE__
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
#pragma pop
#endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif

View File

@@ -21,7 +21,8 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h extra_headers+=serialisation/Hdf5Type.h
endif endif
all: version-cache
all: version-cache Version.h
version-cache: version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\ @if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
@@ -42,7 +43,7 @@ version-cache:
fi;\ fi;\
rm -f vertmp rm -f vertmp
Version.h: Version.h: version-cache
cp version-cache Version.h cp version-cache Version.h
.PHONY: version-cache .PHONY: version-cache
@@ -53,6 +54,19 @@ Version.h:
include Make.inc include Make.inc
include Eigen.inc include Eigen.inc
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS
extra_sources+=$(ZWILS_FERMION_FILES)
endif
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources) CCFILES += $(extra_sources)

View File

@@ -29,23 +29,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H #ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h> #include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h> #include <Grid/algorithms/Preconditioner.h>
NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/Zolotarev.h> #include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h> #include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/JacobiPolynomial.h>
#include <Grid/algorithms/approx/Remez.h> #include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h> #include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h> #include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/iterative/Deflation.h> #include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h>
NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateResidual.h> #include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h> #include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h> #include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h> #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/MinimalResidual.h> #include <Grid/algorithms/iterative/MinimalResidual.h>
@@ -57,7 +67,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h> #include <Grid/algorithms/iterative/PowerMethod.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/CoarsenedMatrix.h> #include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { }; template<class scalar> struct FFTW { };
@@ -138,7 +136,7 @@ public:
flops=0; flops=0;
usec =0; usec =0;
Coordinate layout(Nd,1); Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors); sgrid = new GridCartesian(dimensions,layout,processors,*grid);
}; };
~FFT ( void) { ~FFT ( void) {
@@ -184,14 +182,14 @@ public:
pencil_gd[dim] = G*processors[dim]; pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node // Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors); GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
// Construct pencils // Construct pencils
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View(); autoView(pgbuf_v , pgbuf, CpuWrite);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -232,15 +230,18 @@ public:
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) { for(int p=0;p<processors[dim];p++) {
{
autoView(r_v,result,CpuRead);
autoView(p_v,pgbuf,CpuWrite);
thread_for(idx, sgrid->lSites(),{ thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd); Coordinate cbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf); sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf); peekLocalSite(s,r_v,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L; cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L; pokeLocalSite(s,p_v,cbuf);
pokeLocalSite(s,pgbuf,cbuf);
}); });
}
if (p != processors[dim] - 1) { if (p != processors[dim] - 1) {
result = Cshift(result,dim,L); result = Cshift(result,dim,L);
} }
@@ -269,15 +270,19 @@ public:
flops+= flops_call*NN; flops+= flops_call*NN;
// writing out result // writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
autoView(result_v,result,CpuWrite);
thread_for(idx,sgrid->lSites(),{ thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd); Coordinate clbuf(Nd), cgbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf); sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf; cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc; cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf); peekLocalSite(s,pgbuf_v,cgbuf);
pokeLocalSite(s,result,clbuf); pokeLocalSite(s,result_v,clbuf);
}); });
}
result = result*div; result = result*div;
// destroying plan // destroying plan

View File

@@ -43,15 +43,16 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class LinearOperatorBase { template<class Field> class LinearOperatorBase {
public: public:
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
virtual void OpDirAll (const Field &in, std::vector<Field> &out) = 0; // Abstract base
virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0; virtual void HermOp(const Field &in, Field &out)=0;
virtual ~LinearOperatorBase(){};
}; };
@@ -83,6 +84,9 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
} }
@@ -90,11 +94,13 @@ public:
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2); _Mat.MdagM(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
RealD n1,n2; _Mat.MdagM(in,out);
HermOpAndNorm(in,out,n1,n2);
} }
}; };
@@ -116,6 +122,9 @@ public:
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
assert(0); assert(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
assert(0); assert(0);
@@ -125,17 +134,14 @@ public:
assert(0); assert(0);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2); HermOp(in,out);
out = out + _shift*in; ComplexD dot = innerProduct(in,out);
ComplexD dot;
dot= innerProduct(in,out);
n1=real(dot); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
RealD n1,n2; _Mat.MdagM(in,out);
HermOpAndNorm(in,out,n1,n2); out = out + _shift*in;
} }
}; };
@@ -154,6 +160,9 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
} }
@@ -161,8 +170,7 @@ public:
_Mat.M(in,out); _Mat.M(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out); HermOp(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot); ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
@@ -171,29 +179,61 @@ public:
} }
}; };
////////////////////////////////////////////////////////// template<class Matrix,class Field>
// Even Odd Schur decomp operators; there are several class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
// ways to introduce the even odd checkerboarding Matrix &_Mat;
////////////////////////////////////////////////////////// public:
NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
template<class Field> //////////////////////////////////////////////////////////
class SchurOperatorBase : public LinearOperatorBase<Field> { // Even Odd Schur decomp operators; there are several
// ways to introduce the even odd checkerboarding
//////////////////////////////////////////////////////////
template<class Field>
class SchurOperatorBase : public LinearOperatorBase<Field> {
public: public:
virtual RealD Mpc (const Field &in, Field &out) =0; virtual void Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0; virtual void MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard(); tmp.Checkerboard() = in.Checkerboard();
ni=Mpc(in,tmp); Mpc(in,tmp);
no=MpcDag(tmp,out); MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
MpcDagMpc(in,out,n1,n2); MpcDagMpc(in,out);
ComplexD dot= innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
RealD n1,n2; out.Checkerboard() = in.Checkerboard();
HermOpAndNorm(in,out,n1,n2); MpcDagMpc(in,out);
} }
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
Mpc(in,out); Mpc(in,out);
@@ -208,72 +248,69 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0); assert(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
}; };
template<class Matrix,class Field> };
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> { class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
public: public:
Matrix &_Mat; Matrix &_Mat;
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
tmp.Checkerboard() = !in.Checkerboard(); tmp.Checkerboard() = !in.Checkerboard();
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp); _Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out); _Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl;
_Mat.Mooee(in,out); _Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out); axpy(out,-1.0,tmp,out);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MeooeDag(in,tmp); _Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out); _Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
_Mat.MooeeDag(in,out); _Mat.MooeeDag(in,out);
return axpy_norm(out,-1.0,tmp,out); axpy(out,-1.0,tmp,out);
} }
}; };
template<class Matrix,class Field> template<class Matrix,class Field>
class SchurDiagOneOperator : public SchurOperatorBase<Field> { class SchurDiagOneOperator : public SchurOperatorBase<Field> {
protected: protected:
Matrix &_Mat; Matrix &_Mat;
public: public:
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp); _Mat.MooeeInv(out,tmp);
_Mat.Meooe(tmp,out); _Mat.Meooe(tmp,out);
_Mat.MooeeInv(out,tmp); _Mat.MooeeInv(out,tmp);
axpy(out,-1.0,tmp,in);
return axpy_norm(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MooeeInvDag(in,out); _Mat.MooeeInvDag(in,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
_Mat.MooeeInvDag(tmp,out); _Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
axpy(out,-1.0,tmp,in);
return axpy_norm(out,-1.0,tmp,in);
} }
}; };
template<class Matrix,class Field> template<class Matrix,class Field>
class SchurDiagTwoOperator : public SchurOperatorBase<Field> { class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected: protected:
Matrix &_Mat; Matrix &_Mat;
public: public:
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MooeeInv(in,out); _Mat.MooeeInv(in,out);
@@ -281,9 +318,9 @@ public:
_Mat.MooeeInv(tmp,out); _Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
return axpy_norm(out,-1.0,tmp,in); axpy(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.MeooeDag(in,out); _Mat.MeooeDag(in,out);
@@ -291,97 +328,192 @@ public:
_Mat.MeooeDag(tmp,out); _Mat.MeooeDag(tmp,out);
_Mat.MooeeInvDag(out,tmp); _Mat.MooeeInvDag(out,tmp);
return axpy_norm(out,-1.0,tmp,in); axpy(out,-1.0,tmp,in);
} }
};
template<class Field>
class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
{
public:
virtual void Mpc (const Field& in, Field& out) = 0;
virtual void MpcDag (const Field& in, Field& out) = 0;
virtual void MpcDagMpc(const Field& in, Field& out) {
Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard();
Mpc(in,tmp);
MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
assert(0);
}
virtual void HermOp(const Field& in, Field& out) {
assert(0);
}
void Op(const Field& in, Field& out) {
Mpc(in, out);
}
void AdjOp(const Field& in, Field& out) {
MpcDag(in, out);
}
// Support for coarsening to a multigrid
void OpDiag(const Field& in, Field& out) {
assert(0); // must coarsen the unpreconditioned system
}
void OpDir(const Field& in, Field& out, int dir, int disp) {
assert(0);
}
void OpDirAll(const Field& in, std::vector<Field>& out){
assert(0);
}; };
/////////////////////////////////////////////////////////////////////////////////////////////////// };
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi template<class Matrix, class Field>
/////////////////////////////////////////////////////////////////////////////////////////////////// class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase<Field>
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ; {
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ; public:
/////////////////////////////////////////////////////////////////////////////////////////////////// Matrix& _Mat;
// Staggered use NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
/////////////////////////////////////////////////////////////////////////////////////////////////// virtual void Mpc(const Field& in, Field& out) {
template<class Matrix,class Field> Field tmp(in.Grid());
class SchurStaggeredOperator : public SchurOperatorBase<Field> { tmp.Checkerboard() = !in.Checkerboard();
_Mat.Meooe(in, tmp);
_Mat.MooeeInv(tmp, out);
_Mat.Meooe(out, tmp);
_Mat.Mooee(in, out);
axpy(out, -1.0, tmp, out);
}
virtual void MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.MeooeDag(in, tmp);
_Mat.MooeeInvDag(tmp, out);
_Mat.MeooeDag(out, tmp);
_Mat.MooeeDag(in, out);
axpy(out, -1.0, tmp, out);
}
};
template<class Matrix,class Field>
class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field>
{
protected:
Matrix &_Mat;
public:
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.Meooe(in, out);
_Mat.MooeeInv(out, tmp);
_Mat.Meooe(tmp, out);
_Mat.MooeeInv(out, tmp);
axpy(out, -1.0, tmp, in);
}
virtual void MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.MooeeInvDag(in, out);
_Mat.MeooeDag(out, tmp);
_Mat.MooeeInvDag(tmp, out);
_Mat.MeooeDag(out, tmp);
axpy(out, -1.0, tmp, in);
}
};
template<class Matrix, class Field>
class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field>
{
protected:
Matrix& _Mat;
public:
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.MooeeInv(in, out);
_Mat.Meooe(out, tmp);
_Mat.MooeeInv(tmp, out);
_Mat.Meooe(out, tmp);
axpy(out, -1.0, tmp, in);
}
virtual void MpcDag(const Field& in, Field& out) {
Field tmp(in.Grid());
_Mat.MeooeDag(in, out);
_Mat.MooeeInvDag(out, tmp);
_Mat.MeooeDag(tmp, out);
_Mat.MooeeInvDag(out, tmp);
axpy(out, -1.0, tmp, in);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Staggered use
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected: protected:
Matrix &_Mat; Matrix &_Mat;
Field tmp; Field tmp;
RealD mass; RealD mass;
double tMpc; public:
double tIP;
double tMeo;
double taxpby_norm;
uint64_t ncall;
public:
void Report(void)
{
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
}
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{ {
assert( _Mat.isTrivialEE() ); assert( _Mat.isTrivialEE() );
mass = _Mat.Mass(); mass = _Mat.Mass();
tMpc=0;
tIP =0;
tMeo=0;
taxpby_norm=0;
ncall=0;
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ncall++; Mpc(in,out);
tMpc-=usecond();
n2 = Mpc(in,out);
tMpc+=usecond();
tIP-=usecond();
ComplexD dot= innerProduct(in,out); ComplexD dot= innerProduct(in,out);
tIP+=usecond();
n1 = real(dot); n1 = real(dot);
n2 =0.0;
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
ncall++; Mpc(in,out);
tMpc-=usecond(); // _Mat.Meooe(in,out);
_Mat.Meooe(in,out); // _Mat.Meooe(out,tmp);
_Mat.Meooe(out,tmp); // axpby(out,-1.0,mass*mass,tmp,in);
tMpc+=usecond();
taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
} }
virtual RealD Mpc (const Field &in, Field &out) virtual void Mpc (const Field &in, Field &out)
{ {
Field tmp(in.Grid()); Field tmp(in.Grid());
Field tmp2(in.Grid()); Field tmp2(in.Grid());
// std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl; // _Mat.Mooee(in,out);
_Mat.Mooee(in,out); // _Mat.Mooee(out,tmp);
_Mat.Mooee(out,tmp);
// std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
tMeo-=usecond();
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
tMeo+=usecond(); axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm-=usecond();
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
return Mpc(in,out); Mpc(in,out);
} }
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out) {
assert(0);// Never need with staggered assert(0);// Never need with staggered
} }
}; };
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>; template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for functions of operators // Base classes for functions of operators
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
@@ -399,6 +531,16 @@ public:
template<class Field> class LinearFunction { template<class Field> class LinearFunction {
public: public:
virtual void operator() (const Field &in, Field &out) = 0; virtual void operator() (const Field &in, Field &out) = 0;
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{
assert(in.size() == out.size());
for (unsigned int i = 0; i < in.size(); ++i)
{
(*this)(in[i], out[i]);
}
}
}; };
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
@@ -444,6 +586,7 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
template<typename Field> template<typename Field>
class PlainHermOp : public LinearFunction<Field> { class PlainHermOp : public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
LinearOperatorBase<Field> &_Linop; LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
@@ -457,6 +600,7 @@ public:
template<typename Field> template<typename Field>
class FunctionHermOp : public LinearFunction<Field> { class FunctionHermOp : public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
OperatorFunction<Field> & _poly; OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop; LinearOperatorBase<Field> &_Linop;

View File

@@ -30,13 +30,19 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Field> using Preconditioner = LinearFunction<Field> ;
/*
template<class Field> class Preconditioner : public LinearFunction<Field> { template<class Field> class Preconditioner : public LinearFunction<Field> {
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field & psi)=0; virtual void operator()(const Field &src, Field & psi)=0;
}; };
*/
template<class Field> class TrivialPrecon : public Preconditioner<Field> { template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public: public:
void operator()(const Field &src, Field & psi){ using Preconditioner<Field>::operator();
virtual void operator()(const Field &src, Field & psi){
psi = src; psi = src;
} }
TrivialPrecon(void){}; TrivialPrecon(void){};

View File

@@ -38,15 +38,17 @@ template<class Field> class SparseMatrixBase {
public: public:
virtual GridBase *Grid(void) =0; virtual GridBase *Grid(void) =0;
// Full checkerboar operations // Full checkerboar operations
virtual RealD M (const Field &in, Field &out)=0; virtual void M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0; virtual void Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MdagM(const Field &in, Field &out) {
Field tmp (in.Grid()); Field tmp (in.Grid());
ni=M(in,tmp); M(in,tmp);
no=Mdag(tmp,out); Mdag(tmp,out);
} }
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
virtual ~SparseMatrixBase() {};
}; };
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
@@ -71,7 +73,7 @@ public:
virtual void MeooeDag (const Field &in, Field &out)=0; virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0; virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0; virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual ~CheckerBoardedSparseMatrixBase() {};
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -95,6 +95,24 @@ public:
Coeffs[order-1] = 1.; Coeffs[order-1] = 1.;
}; };
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
// Similar kick effect below the threshold as Lanczos filter approach
void InitLowPass(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD k=(order-1.0);
RealD s=std::cos( j*M_PI*(k+0.5)/order );
Coeffs[j] = s * 2.0/order;
}
};
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD)) void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{ {
lo=_lo; lo=_lo;
@@ -216,10 +234,8 @@ public:
GridBase *grid=in.Grid(); GridBase *grid=in.Grid();
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites(); int vol=grid->gSites();
typedef typename Field::vector_type vector_type;
Field T0(grid); T0 = in; Field T0(grid); T0 = in;
Field T1(grid); Field T1(grid);
@@ -234,20 +250,34 @@ public:
RealD xscale = 2.0/(hi-lo); RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo); RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y); Linop.HermOp(T0,y);
T1=y*xscale+in*mscale; axpby(T1,xscale,mscale,y,in);
// sum = .5 c[0] T0 + c[1] T1 // sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1; // out = ()*T0 + Coeffs[1]*T1;
axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
for(int n=2;n<order;n++){ for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y); Linop.HermOp(*Tn,y);
#if 0
y=xscale*y+mscale*(*Tn); auto y_v = y.View();
auto Tn_v = Tn->View();
*Tnp=2.0*y-(*Tnm); auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
out=out+Coeffs[n]* (*Tnp); constexpr int Nsimd = vector_type::Nsimd();
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#else
axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#endif
// Cycle pointers to avoid copies // Cycle pointers to avoid copies
Field *swizzle = Tnm; Field *swizzle = Tnm;
Tnm =Tn; Tnm =Tn;
@@ -262,6 +292,7 @@ public:
template<class Field> template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> { class ChebyshevLanczos : public Chebyshev<Field> {
private: private:
std::vector<RealD> Coeffs; std::vector<RealD> Coeffs;
int order; int order;
RealD alpha; RealD alpha;

View File

@@ -0,0 +1,129 @@
#ifndef GRID_JACOBIPOLYNOMIAL_H
#define GRID_JACOBIPOLYNOMIAL_H
#include <Grid/algorithms/LinearOperator.h>
NAMESPACE_BEGIN(Grid);
template<class Field>
class JacobiPolynomial : public OperatorFunction<Field> {
private:
using OperatorFunction<Field>::operator();
int order;
RealD hi;
RealD lo;
RealD alpha;
RealD beta;
public:
void csv(std::ostream &out){
csv(out,lo,hi);
}
void csv(std::ostream &out,RealD llo,RealD hhi){
RealD diff = hhi-llo;
RealD delta = diff*1.0e-5;
for (RealD x=llo-delta; x<=hhi; x+=delta) {
RealD f = approx(x);
out<< x<<" "<<f <<std::endl;
}
return;
}
JacobiPolynomial(){};
JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta)
{
lo=_lo;
hi=_hi;
alpha=_alpha;
beta=_beta;
order=_order;
};
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1.0;
RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
Tn =T1;
Tnm=T0;
for(int n=2;n<=order;n++){
RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
Tnm=Tn;
Tn =Tnp;
}
return Tnp;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites();
Field T0(grid);
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// RealD T0=1.0;
T0=in;
// RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
// = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
Linop.HermOp(T0,y);
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
y=y*xscale+in*mscale;
// RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
RealD halfAmB = (alpha-beta)*0.5;
RealD halfApBp2= (alpha+beta+2.0)*0.5;
T1 = halfAmB * in + halfApBp2*y;
for(int n=2;n<=order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
// Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
cny=cny/cnp;
cn1=cn1/cnp;
cn1=cn1/cnp;
cnm=cnm/cnp;
*Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
out=*Tnp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,473 @@
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<string>
#include<iostream>
#include<iomanip>
#include<cassert>
#include<Grid/algorithms/approx/RemezGeneral.h>
// Constructor
AlgRemezGeneral::AlgRemezGeneral(double lower, double upper, long precision,
bigfloat (*f)(bigfloat x, void *data), void *data): f(f),
data(data),
prec(precision),
apstrt(lower), apend(upper), apwidt(upper - lower),
n(0), d(0), pow_n(0), pow_d(0)
{
bigfloat::setDefaultPrecision(prec);
std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n";
std::cout<<"Precision of arithmetic is "<<precision<<std::endl;
}
//Determine the properties of the numerator and denominator polynomials
void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in){
pow_n = num_degree;
pow_d = den_degree;
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0);
num_type = num_type_in;
den_type = den_type_in;
num_pows.resize(pow_n+1);
den_pows.resize(pow_d+1);
int n_in = 0;
bool odd = num_type == PolyType::Full || num_type == PolyType::Odd;
bool even = num_type == PolyType::Full || num_type == PolyType::Even;
for(int i=0;i<=pow_n;i++){
num_pows[i] = -1;
if(i % 2 == 0 && even) num_pows[i] = n_in++;
if(i % 2 == 1 && odd) num_pows[i] = n_in++;
}
std::cout << n_in << " terms in numerator" << std::endl;
--n_in; //power is 1 less than the number of terms, eg pow=1 a x^1 + b x^0
int d_in = 0;
odd = den_type == PolyType::Full || den_type == PolyType::Odd;
even = den_type == PolyType::Full || den_type == PolyType::Even;
for(int i=0;i<=pow_d;i++){
den_pows[i] = -1;
if(i % 2 == 0 && even) den_pows[i] = d_in++;
if(i % 2 == 1 && odd) den_pows[i] = d_in++;
}
std::cout << d_in << " terms in denominator" << std::endl;
--d_in;
n = n_in;
d = d_in;
}
//Setup algorithm
void AlgRemezGeneral::reinitializeAlgorithm(){
spread = 1.0e37;
iter = 0;
neq = n + d + 1; //not +2 because highest-power term in denominator is fixed to 1
param.resize(neq);
yy.resize(neq+1);
//Initialize linear equation temporaries
A.resize(neq*neq);
B.resize(neq);
IPS.resize(neq);
//Initialize maximum and minimum errors
xx.resize(neq+2);
mm.resize(neq+1);
initialGuess();
//Initialize search steps
step.resize(neq+1);
stpini();
}
double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degree,
const PolyType num_type_in, const PolyType den_type_in,
const double _tolerance, const int report_freq){
//Setup the properties of the polynomial
setupPolyProperties(num_degree, den_degree, num_type_in, den_type_in);
//Setup the algorithm
reinitializeAlgorithm();
bigfloat tolerance = _tolerance;
//Iterate until convergance
while (spread > tolerance) {
if (iter++ % report_freq==0)
std::cout<<"Iteration " <<iter-1<<" spread "<<(double)spread<<" delta "<<(double)delta << std::endl;
equations();
if (delta < tolerance) {
std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
assert(0);
};
assert( delta>= tolerance );
search();
}
int sign;
double error = (double)getErr(mm[0],&sign);
std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl;
// Return the maximum error in the approximation
return error;
}
// Initial values of maximal and minimal errors
void AlgRemezGeneral::initialGuess(){
// Supply initial guesses for solution points
long ncheb = neq; // Degree of Chebyshev error estimate
// Find ncheb+1 extrema of Chebyshev polynomial
bigfloat a = ncheb;
bigfloat r;
mm[0] = apstrt;
for (long i = 1; i < ncheb; i++) {
r = 0.5 * (1 - cos((M_PI * i)/(double) a));
//r *= sqrt_bf(r);
r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
mm[i] = apstrt + r * apwidt;
}
mm[ncheb] = apend;
a = 2.0 * ncheb;
for (long i = 0; i <= ncheb; i++) {
r = 0.5 * (1 - cos(M_PI * (2*i+1)/(double) a));
//r *= sqrt_bf(r); // Squeeze to low end of interval
r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
xx[i] = apstrt + r * apwidt;
}
}
// Initialise step sizes
void AlgRemezGeneral::stpini(){
xx[neq+1] = apend;
delta = 0.25;
step[0] = xx[0] - apstrt;
for (int i = 1; i < neq; i++) step[i] = xx[i] - xx[i-1];
step[neq] = step[neq-1];
}
// Search for error maxima and minima
void AlgRemezGeneral::search(){
bigfloat a, q, xm, ym, xn, yn, xx1;
int emsign, ensign, steps;
int meq = neq + 1;
bigfloat eclose = 1.0e30;
bigfloat farther = 0l;
bigfloat xx0 = apstrt;
for (int i = 0; i < meq; i++) {
steps = 0;
xx1 = xx[i]; // Next zero
if (i == meq-1) xx1 = apend;
xm = mm[i];
ym = getErr(xm,&emsign);
q = step[i];
xn = xm + q;
if (xn < xx0 || xn >= xx1) { // Cannot skip over adjacent boundaries
q = -q;
xn = xm;
yn = ym;
ensign = emsign;
} else {
yn = getErr(xn,&ensign);
if (yn < ym) {
q = -q;
xn = xm;
yn = ym;
ensign = emsign;
}
}
while(yn >= ym) { // March until error becomes smaller.
if (++steps > 10)
break;
ym = yn;
xm = xn;
emsign = ensign;
a = xm + q;
if (a == xm || a <= xx0 || a >= xx1)
break;// Must not skip over the zeros either side.
xn = a;
yn = getErr(xn,&ensign);
}
mm[i] = xm; // Position of maximum
yy[i] = ym; // Value of maximum
if (eclose > ym) eclose = ym;
if (farther < ym) farther = ym;
xx0 = xx1; // Walk to next zero.
} // end of search loop
q = (farther - eclose); // Decrease step size if error spread increased
if (eclose != 0.0) q /= eclose; // Relative error spread
if (q >= spread)
delta *= 0.5; // Spread is increasing; decrease step size
spread = q;
for (int i = 0; i < neq; i++) {
q = yy[i+1];
if (q != 0.0) q = yy[i] / q - (bigfloat)1l;
else q = 0.0625;
if (q > (bigfloat)0.25) q = 0.25;
q *= mm[i+1] - mm[i];
step[i] = q * delta;
}
step[neq] = step[neq-1];
for (int i = 0; i < neq; i++) { // Insert new locations for the zeros.
xm = xx[i] - step[i];
if (xm <= apstrt)
continue;
if (xm >= apend)
continue;
if (xm <= mm[i])
xm = (bigfloat)0.5 * (mm[i] + xx[i]);
if (xm >= mm[i+1])
xm = (bigfloat)0.5 * (mm[i+1] + xx[i]);
xx[i] = xm;
}
}
// Solve the equations
void AlgRemezGeneral::equations(){
bigfloat x, y, z;
bigfloat *aa;
for (int i = 0; i < neq; i++) { // set up the equations for solution by simq()
int ip = neq * i; // offset to 1st element of this row of matrix
x = xx[i]; // the guess for this row
y = func(x); // right-hand-side vector
z = (bigfloat)1l;
aa = A.data()+ip;
int t = 0;
for (int j = 0; j <= pow_n; j++) {
if(num_pows[j] != -1){ *aa++ = z; t++; }
z *= x;
}
assert(t == n+1);
z = (bigfloat)1l;
t = 0;
for (int j = 0; j < pow_d; j++) {
if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
z *= x;
}
assert(t == d);
B[i] = y * z; // Right hand side vector
}
// Solve the simultaneous linear equations.
if (simq()){
std::cout<<"simq failed\n";
exit(0);
}
}
// Evaluate the rational form P(x)/Q(x) using coefficients
// from the solution vector param
bigfloat AlgRemezGeneral::approx(const bigfloat x) const{
// Work backwards toward the constant term.
int c = n;
bigfloat yn = param[c--]; // Highest order numerator coefficient
for (int i = pow_n-1; i >= 0; i--) yn = x * yn + (num_pows[i] != -1 ? param[c--] : bigfloat(0l));
c = n+d;
bigfloat yd = 1l; //Highest degree coefficient is 1.0
for (int i = pow_d-1; i >= 0; i--) yd = x * yd + (den_pows[i] != -1 ? param[c--] : bigfloat(0l));
return(yn/yd);
}
// Compute size and sign of the approximation error at x
bigfloat AlgRemezGeneral::getErr(bigfloat x, int *sign) const{
bigfloat f = func(x);
bigfloat e = approx(x) - f;
if (f != 0) e /= f;
if (e < (bigfloat)0.0) {
*sign = -1;
e = -e;
}
else *sign = 1;
return(e);
}
// Solve the system AX=B
int AlgRemezGeneral::simq(){
int ip, ipj, ipk, ipn;
int idxpiv;
int kp, kp1, kpk, kpn;
int nip, nkp;
bigfloat em, q, rownrm, big, size, pivot, sum;
bigfloat *aa;
bigfloat *X = param.data();
int n = neq;
int nm1 = n - 1;
// Initialize IPS and X
int ij = 0;
for (int i = 0; i < n; i++) {
IPS[i] = i;
rownrm = 0.0;
for(int j = 0; j < n; j++) {
q = abs_bf(A[ij]);
if(rownrm < q) rownrm = q;
++ij;
}
if (rownrm == (bigfloat)0l) {
std::cout<<"simq rownrm=0\n";
return(1);
}
X[i] = (bigfloat)1.0 / rownrm;
}
for (int k = 0; k < nm1; k++) {
big = 0.0;
idxpiv = 0;
for (int i = k; i < n; i++) {
ip = IPS[i];
ipk = n*ip + k;
size = abs_bf(A[ipk]) * X[ip];
if (size > big) {
big = size;
idxpiv = i;
}
}
if (big == (bigfloat)0l) {
std::cout<<"simq big=0\n";
return(2);
}
if (idxpiv != k) {
int j = IPS[k];
IPS[k] = IPS[idxpiv];
IPS[idxpiv] = j;
}
kp = IPS[k];
kpk = n*kp + k;
pivot = A[kpk];
kp1 = k+1;
for (int i = kp1; i < n; i++) {
ip = IPS[i];
ipk = n*ip + k;
em = -A[ipk] / pivot;
A[ipk] = -em;
nip = n*ip;
nkp = n*kp;
aa = A.data()+nkp+kp1;
for (int j = kp1; j < n; j++) {
ipj = nip + j;
A[ipj] = A[ipj] + em * *aa++;
}
}
}
kpn = n * IPS[n-1] + n - 1; // last element of IPS[n] th row
if (A[kpn] == (bigfloat)0l) {
std::cout<<"simq A[kpn]=0\n";
return(3);
}
ip = IPS[0];
X[0] = B[ip];
for (int i = 1; i < n; i++) {
ip = IPS[i];
ipj = n * ip;
sum = 0.0;
for (int j = 0; j < i; j++) {
sum += A[ipj] * X[j];
++ipj;
}
X[i] = B[ip] - sum;
}
ipn = n * IPS[n-1] + n - 1;
X[n-1] = X[n-1] / A[ipn];
for (int iback = 1; iback < n; iback++) {
//i goes (n-1),...,1
int i = nm1 - iback;
ip = IPS[i];
nip = n*ip;
sum = 0.0;
aa = A.data()+nip+i+1;
for (int j= i + 1; j < n; j++)
sum += *aa++ * X[j];
X[i] = (X[i] - sum) / A[nip+i];
}
return(0);
}
void AlgRemezGeneral::csv(std::ostream & os) const{
os << "Numerator" << std::endl;
for(int i=0;i<=pow_n;i++){
os << getCoeffNum(i) << "*x^" << i;
if(i!=pow_n) os << " + ";
}
os << std::endl;
os << "Denominator" << std::endl;
for(int i=0;i<=pow_d;i++){
os << getCoeffDen(i) << "*x^" << i;
if(i!=pow_d) os << " + ";
}
os << std::endl;
//For a true minimax solution the errors should all be equal and the signs should oscillate +-+-+- etc
int sign;
os << "Errors at maxima: coordinate, error, (sign)" << std::endl;
for(int i=0;i<neq+1;i++){
os << mm[i] << " " << getErr(mm[i],&sign) << " (" << sign << ")" << std::endl;
}
os << "Scan over range:" << std::endl;
int npt = 60;
bigfloat dlt = (apend - apstrt)/bigfloat(npt-1);
for (bigfloat x=apstrt; x<=apend; x = x + dlt) {
double f = evaluateFunc(x);
double r = evaluateApprox(x);
os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
}
return;
}

View File

@@ -0,0 +1,170 @@
/*
C.Kelly Jan 2020 based on implementation by M. Clark May 2005
AlgRemezGeneral is an implementation of the Remez algorithm for approximating an arbitrary function by a rational polynomial
It includes optional restriction to odd/even polynomials for the numerator and/or denominator
*/
#ifndef INCLUDED_ALG_REMEZ_GENERAL_H
#define INCLUDED_ALG_REMEZ_GENERAL_H
#include <stddef.h>
#include <Grid/GridStd.h>
#ifdef HAVE_LIBGMP
#include "bigfloat.h"
#else
#include "bigfloat_double.h"
#endif
class AlgRemezGeneral{
public:
enum PolyType { Even, Odd, Full };
private:
// In GSL-style, pass the function as a function pointer. Any data required to evaluate the function is passed in as a void pointer
bigfloat (*f)(bigfloat x, void *data);
void *data;
// The approximation parameters
std::vector<bigfloat> param;
bigfloat norm;
// The number of non-zero terms in the numerator and denominator
int n, d;
// The numerator and denominator degree (i.e. the largest power)
int pow_n, pow_d;
// Specify if the numerator and/or denominator are odd/even polynomials
PolyType num_type;
PolyType den_type;
std::vector<int> num_pows; //contains the mapping, with -1 if not present
std::vector<int> den_pows;
// The bounds of the approximation
bigfloat apstrt, apwidt, apend;
// Variables used to calculate the approximation
int nd1, iter;
std::vector<bigfloat> xx;
std::vector<bigfloat> mm;
std::vector<bigfloat> step;
bigfloat delta, spread;
// Variables used in search
std::vector<bigfloat> yy;
// Variables used in solving linear equations
std::vector<bigfloat> A;
std::vector<bigfloat> B;
std::vector<int> IPS;
// The number of equations we must solve at each iteration (n+d+1)
int neq;
// The precision of the GNU MP library
long prec;
// Initialize member variables associated with the polynomial's properties
void setupPolyProperties(int num_degree, int den_degree, PolyType num_type_in, PolyType den_type_in);
// Initial values of maximal and minmal errors
void initialGuess();
// Initialise step sizes
void stpini();
// Initialize the algorithm
void reinitializeAlgorithm();
// Solve the equations
void equations();
// Search for error maxima and minima
void search();
// Calculate function required for the approximation
inline bigfloat func(bigfloat x) const{
return f(x, data);
}
// Compute size and sign of the approximation error at x
bigfloat getErr(bigfloat x, int *sign) const;
// Solve the system AX=B where X = param
int simq();
// Evaluate the rational form P(x)/Q(x) using coefficients from the solution vector param
bigfloat approx(bigfloat x) const;
public:
AlgRemezGeneral(double lower, double upper, long prec,
bigfloat (*f)(bigfloat x, void *data), void *data);
inline int getDegree(void) const{
assert(n==d);
return n;
}
// Reset the bounds of the approximation
inline void setBounds(double lower, double upper) {
apstrt = lower;
apend = upper;
apwidt = apend - apstrt;
}
// Get the bounds of the approximation
inline void getBounds(double &lower, double &upper) const{
lower=(double)apstrt;
upper=(double)apend;
}
// Run the algorithm to generate the rational approximation
double generateApprox(int num_degree, int den_degree,
PolyType num_type, PolyType den_type,
const double tolerance = 1e-15, const int report_freq = 1000);
inline double generateApprox(int num_degree, int den_degree,
const double tolerance = 1e-15, const int report_freq = 1000){
return generateApprox(num_degree, den_degree, Full, Full, tolerance, report_freq);
}
// Evaluate the rational form P(x)/Q(x) using coefficients from the
// solution vector param
inline double evaluateApprox(double x) const{
return (double)approx((bigfloat)x);
}
// Evaluate the rational form Q(x)/P(x) using coefficients from the solution vector param
inline double evaluateInverseApprox(double x) const{
return 1.0/(double)approx((bigfloat)x);
}
// Calculate function required for the approximation
inline double evaluateFunc(double x) const{
return (double)func((bigfloat)x);
}
// Calculate inverse function required for the approximation
inline double evaluateInverseFunc(double x) const{
return 1.0/(double)func((bigfloat)x);
}
// Dump csv of function, approx and error
void csv(std::ostream &os = std::cout) const;
// Get the coefficient of the term x^i in the numerator
inline double getCoeffNum(const int i) const{
return num_pows[i] == -1 ? 0. : double(param[num_pows[i]]);
}
// Get the coefficient of the term x^i in the denominator
inline double getCoeffDen(const int i) const{
if(i == pow_d) return 1.0;
else return den_pows[i] == -1 ? 0. : double(param[den_pows[i]+n+1]);
}
};
#endif

View File

@@ -0,0 +1,183 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/ZMobius.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/algorithms/approx/ZMobius.h>
#include <Grid/algorithms/approx/RemezGeneral.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
//Compute the tanh approximation
inline double epsilonMobius(const double x, const std::vector<ComplexD> &w){
int Ls = w.size();
ComplexD fxp = 1., fmp = 1.;
for(int i=0;i<Ls;i++){
fxp = fxp * ( w[i] + x );
fmp = fmp * ( w[i] - x );
}
return ((fxp - fmp)/(fxp + fmp)).real();
}
inline double epsilonMobius(const double x, const std::vector<RealD> &w){
int Ls = w.size();
double fxp = 1., fmp = 1.;
for(int i=0;i<Ls;i++){
fxp = fxp * ( w[i] + x );
fmp = fmp * ( w[i] - x );
}
return (fxp - fmp)/(fxp + fmp);
}
//Compute the tanh approximation in a form suitable for the Remez
bigfloat epsilonMobius(bigfloat x, void* data){
const std::vector<RealD> &omega = *( (std::vector<RealD> const*)data );
bigfloat fxp(1.0);
bigfloat fmp(1.0);
for(int i=0;i<omega.size();i++){
fxp = fxp * ( bigfloat(omega[i]) + x);
fmp = fmp * ( bigfloat(omega[i]) - x);
}
return (fxp - fmp)/(fxp + fmp);
}
//Compute the Zmobius Omega parameters suitable for eigenvalue range -lambda_bound <= lambda <= lambda_bound
//Note omega_i = 1/(b_i + c_i) where b_i and c_i are the Mobius parameters
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound){
assert(omega_in.size() == Ls_in);
omega_out.resize(Ls_out);
//Use the Remez algorithm to generate the appropriate rational polynomial
//For odd polynomial, to satisfy Haar condition must take either positive or negative half of range (cf https://arxiv.org/pdf/0803.0439.pdf page 6)
AlgRemezGeneral remez(0, lambda_bound, 64, &epsilonMobius, (void*)&omega_in);
remez.generateApprox(Ls_out-1, Ls_out,AlgRemezGeneral::Odd, AlgRemezGeneral::Even, 1e-15, 100);
remez.csv(std::cout);
//The rational approximation has the form [ f(x) - f(-x) ] / [ f(x) + f(-x) ] where f(x) = \Prod_{i=0}^{L_s-1} ( \omega_i + x )
//cf https://academiccommons.columbia.edu/doi/10.7916/D8T72HD7 pg 102
//omega_i are therefore the negative of the complex roots of f(x)
//We can find the roots by recognizing that the eigenvalues of a matrix A are the roots of the characteristic polynomial
// \rho(\lambda) = det( A - \lambda I ) where I is the unit matrix
//The matrix whose characteristic polynomial is an arbitrary monic polynomial a0 + a1 x + a2 x^2 + ... x^n is the companion matrix
// A = | 0 1 0 0 0 .... 0 |
// | 0 0 1 0 0 .... 0 |
// | : : : : : : |
// | 0 0 0 0 0 1
// | -a0 -a1 -a2 ... ... -an|
//Note the Remez defines the largest power to have unit coefficient
std::vector<RealD> coeffs(Ls_out+1);
for(int i=0;i<Ls_out+1;i+=2) coeffs[i] = coeffs[i] = remez.getCoeffDen(i); //even powers
for(int i=1;i<Ls_out+1;i+=2) coeffs[i] = coeffs[i] = remez.getCoeffNum(i); //odd powers
std::vector<std::complex<RealD> > roots(Ls_out);
//Form the companion matrix
Eigen::MatrixXd compn(Ls_out,Ls_out);
for(int i=0;i<Ls_out-1;i++) compn(i,0) = 0.;
compn(Ls_out - 1, 0) = -coeffs[0];
for(int j=1;j<Ls_out;j++){
for(int i=0;i<Ls_out-1;i++) compn(i,j) = i == j-1 ? 1. : 0.;
compn(Ls_out - 1, j) = -coeffs[j];
}
//Eigensolve
Eigen::EigenSolver<Eigen::MatrixXd> slv(compn, false);
const auto & ev = slv.eigenvalues();
for(int i=0;i<Ls_out;i++)
omega_out[i] = -ev(i);
//Sort ascending (smallest at start of vector!)
std::sort(omega_out.begin(), omega_out.end(),
[&](const ComplexD &a, const ComplexD &b){ return a.real() < b.real() || (a.real() == b.real() && a.imag() < b.imag()); });
//McGlynn thesis pg 122 suggest improved iteration counts if magnitude of omega diminishes towards the center of the 5th dimension
std::vector<ComplexD> omega_tmp = omega_out;
int s_low=0, s_high=Ls_out-1, ss=0;
for(int s_from = Ls_out-1; s_from >= 0; s_from--){ //loop from largest omega
int s_to;
if(ss % 2 == 0){
s_to = s_low++;
}else{
s_to = s_high--;
}
omega_out[s_to] = omega_tmp[s_from];
++ss;
}
std::cout << "Resulting omega_i:" << std::endl;
for(int i=0;i<Ls_out;i++)
std::cout << omega_out[i] << std::endl;
std::cout << "Test result matches the approximate polynomial found by the Remez" << std::endl;
std::cout << "<x> <remez approx> <poly approx> <diff poly approx remez approx> <exact> <diff poly approx exact>\n";
int npt = 60;
double dlt = lambda_bound/double(npt-1);
for (int i =0; i<npt; i++){
double x = i*dlt;
double r = remez.evaluateApprox(x);
double p = epsilonMobius(x, omega_out);
double e = epsilonMobius(x, omega_in);
std::cout << x<< " " << r << " " << p <<" " <<r-p << " " << e << " " << e-p << std::endl;
}
}
//mobius_param = b+c with b-c=1
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
std::vector<RealD> omega_in(Ls_in, 1./mobius_param);
computeZmobiusOmega(omega_out, Ls_out, omega_in, Ls_in, lambda_bound);
}
//ZMobius class takes gamma_i = (b+c) omega_i as its input, where b, c are factored out
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out,
const RealD mobius_param_out, const int Ls_out,
const RealD mobius_param_in, const int Ls_in,
const RealD lambda_bound){
computeZmobiusOmega(gamma_out, Ls_out, mobius_param_in, Ls_in, lambda_bound);
for(int i=0;i<Ls_out;i++) gamma_out[i] = gamma_out[i] * mobius_param_out;
}
//Assumes mobius_param_out == mobius_param_in
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound){
computeZmobiusGamma(gamma_out, mobius_param, Ls_out, mobius_param, Ls_in, lambda_bound);
}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,57 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/ZMobius.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ZMOBIUS_APPROX_H
#define GRID_ZMOBIUS_APPROX_H
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
//Compute the Zmobius Omega parameters suitable for eigenvalue range -lambda_bound <= lambda <= lambda_bound
//Note omega_i = 1/(b_i + c_i) where b_i and c_i are the Mobius parameters
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound);
//mobius_param = b+c with b-c=1
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
//ZMobius class takes gamma_i = (b+c) omega_i as its input, where b, c are factored out
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out,
const RealD mobius_param_out, const int Ls_out,
const RealD mobius_param_in, const int Ls_in,
const RealD lambda_bound);
//Assumes mobius_param_out == mobius_param_in
void computeZmobiusGamma(std::vector<ComplexD> &gamma_out, const int Ls_out, const RealD mobius_param, const int Ls_in, const RealD lambda_bound);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@@ -25,6 +25,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef INCLUDED_BIGFLOAT_DOUBLE_H
#define INCLUDED_BIGFLOAT_DOUBLE_H
#include <math.h> #include <math.h>
typedef double mfloat; typedef double mfloat;
@@ -186,4 +190,6 @@ public:
// friend bigfloat& random(void); // friend bigfloat& random(void);
}; };
#endif

View File

@@ -0,0 +1,234 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BiCGSTAB.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: juettner <juettner@soton.ac.uk>
Author: David Murphy <djmurphy@mit.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BICGSTAB_H
#define GRID_BICGSTAB_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template <class Field>
class BiCGSTAB : public OperatorFunction<Field>
{
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
BiCGSTAB(RealD tol, Integer maxit, bool err_on_no_conv = true) :
Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field>& Linop, const Field& src, Field& psi)
{
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD cp(0), rho(1), rho_prev(0), alpha(1), beta(0), omega(1);
RealD a(0), bo(0), b(0), ssq(0);
Field p(src);
Field r(src);
Field rhat(src);
Field v(src);
Field s(src);
Field t(src);
Field h(src);
v = Zero();
p = Zero();
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.Op(psi, v);
b = norm2(v);
r = src - v;
rhat = r;
a = norm2(r);
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: mp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: r " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if(a <= rsq){ return; }
std::cout << GridLogIterative << std::setprecision(8) << "BiCGSTAB: k=0 residual " << a << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
GridStopWatch LinearCombTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++)
{
rho_prev = rho;
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Crho = innerProduct(rhat,r);
InnerTimer.Stop();
rho = Crho.real();
beta = (rho / rho_prev) * (alpha / omega);
LinearCombTimer.Start();
bo = beta * omega;
{
autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
MatrixTimer.Start();
Linop.Op(p,v);
MatrixTimer.Stop();
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Calpha = innerProduct(rhat,v);
InnerTimer.Stop();
alpha = rho / Calpha.real();
LinearCombTimer.Start();
{
autoView( p_v , p, AcceleratorRead);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
autoView( psi_v,psi, AcceleratorRead);
autoView( h_v , h, AcceleratorWrite);
autoView( s_v , s, AcceleratorWrite);
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
});
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
MatrixTimer.Start();
Linop.Op(s,t);
MatrixTimer.Stop();
LinalgTimer.Start();
InnerTimer.Start();
ComplexD Comega = innerProduct(t,s);
InnerTimer.Stop();
omega = Comega.real() / norm2(t);
LinearCombTimer.Start();
{
autoView( psi_v,psi, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
autoView( h_v , h, AcceleratorRead);
autoView( s_v , s, AcceleratorRead);
autoView( t_v , t, AcceleratorRead);
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
}
LinearCombTimer.Stop();
cp = norm2(r);
LinalgTimer.Stop();
std::cout << GridLogIterative << "BiCGSTAB: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
// Stopping condition
if(cp <= rsq)
{
SolverTimer.Stop();
Linop.Op(psi, v);
p = v - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "BiCGSTAB Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp/ssq) << std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual << std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown " << std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
if(ErrorOnNoConverge){ assert(0); }
IterationsToComplete = k;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,159 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BiCGSTABMixedPrec.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
Author: David Murphy <djmurphy@mit.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BICGSTAB_MIXED_PREC_H
#define GRID_BICGSTAB_MIXED_PREC_H
NAMESPACE_BEGIN(Grid);
// Mixed precision restarted defect correction BiCGSTAB
template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
{
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; // Grid for single-precision fields
RealD OuterLoopNormMult; // Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionBiCGSTAB(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid,
LinearOperatorBase<FieldF>& _Linop_f, LinearOperatorBase<FieldD>& _Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d), Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit),
MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), OuterLoopNormMult(100.), guesser(NULL) {};
void useGuesser(LinearFunction<FieldF>& g){
guesser = &g;
}
void operator() (const FieldD& src_d_in, FieldD& sol_d)
{
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.Checkerboard();
sol_d.Checkerboard() = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid);
src_f.Checkerboard() = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb;
BiCGSTAB<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++)
{
// Compute double precision rsd and also new RHS vector.
Linop_d.Op(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration " << outer_iter << " residual " << norm << " target " << stop << std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Outer iteration converged on iteration " << outer_iter << std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop){ inner_tol *= 2; } // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
sol_f = Zero();
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL){ (*guesser)(src_f, sol_f); }
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
//Final trial CG
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Starting final patch-up double-precision solve" << std::endl;
BiCGSTAB<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TotalTimer.Stop();
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout << GridLogMessage << "MixedPrecisionBiCGSTAB: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -52,6 +52,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer PrintInterval; //GridLogMessages or Iterative Integer PrintInterval; //GridLogMessages or Iterative
RealD TrueResidual;
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100) : Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
@@ -306,7 +307,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD); Linop.HermOp(X, AD);
AD = AD-B; AD = AD-B;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl; TrueResidual = std::sqrt(norm2(AD)/norm2(B));
std::cout << GridLogMessage <<"\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl; std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@@ -442,7 +444,8 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
Linop.HermOp(Psi, AP); Linop.HermOp(Psi, AP);
AP = AP-Src; AP = AP-Src;
std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl; TrueResidual = std::sqrt(norm2(AP)/norm2(Src));
std::cout <<GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl; std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@@ -653,7 +656,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
if ( rr > max_resid ) max_resid = rr; if ( rr > max_resid ) max_resid = rr;
} }
std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl; std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) { if ( max_resid < Tolerance*Tolerance ) {
@@ -668,7 +671,8 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]); for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b]; for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl; TrueResidual = std::sqrt(normv(AD)/normv(B));
std::cout << GridLogMessage << "\tTrue residual is " << TrueResidual <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl; std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;

View File

@@ -49,6 +49,7 @@ public:
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
RealD TrueResidual;
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), : Tolerance(tol),
@@ -72,7 +73,6 @@ public:
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b); Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp; r = src - mmp;
@@ -82,6 +82,14 @@ public:
cp = a; cp = a;
ssq = norm2(src); ssq = norm2(src);
// Handle trivial case of zero src
if (ssq == 0.){
psi = Zero();
IterationsToComplete = 1;
TrueResidual = 0.;
return;
}
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl; std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl; std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl; std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl;
@@ -93,6 +101,7 @@ public:
// Check if guess is really REALLY good :) // Check if guess is really REALLY good :)
if (cp <= rsq) { if (cp <= rsq) {
TrueResidual = std::sqrt(a/ssq);
std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl; std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
IterationsToComplete = 0; IterationsToComplete = 0;
return; return;
@@ -131,18 +140,20 @@ public:
b = cp / c; b = cp / c;
LinearCombTimer.Start(); LinearCombTimer.Start();
auto psi_v = psi.View(); {
auto p_v = p.View(); autoView( psi_v , psi, AcceleratorWrite);
auto r_v = r.View(); autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
}); });
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
// Stopping condition // Stopping condition
if (cp <= rsq) { if (cp <= rsq) {
@@ -154,26 +165,33 @@ public:
RealD resnorm = std::sqrt(norm2(p)); RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl; << "\tComputed residual " << std::sqrt(cp / ssq)
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; << "\tTrue residual " << true_residual
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl; std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k; IterationsToComplete = k;
TrueResidual = true_residual;
return; return;
} }
} }
// Failed. Calculate true residual before giving up
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
TrueResidual = sqrt(norm2(p)/ssq);
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl; std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) assert(0);

View File

@@ -36,6 +36,7 @@ NAMESPACE_BEGIN(Grid);
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> { class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public: public:
using LinearFunction<FieldD>::operator();
RealD Tolerance; RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations; Integer MaxInnerIterations;
@@ -48,6 +49,7 @@ NAMESPACE_BEGIN(Grid);
Integer TotalInnerIterations; //Number of inner CG iterations Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
RealD TrueResidual;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser; LinearFunction<FieldF> *guesser;
@@ -67,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
} }
void operator() (const FieldD &src_d_in, FieldD &sol_d){ void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
TotalInnerIterations = 0; TotalInnerIterations = 0;
GridStopWatch TotalTimer; GridStopWatch TotalTimer;
@@ -79,6 +82,11 @@ NAMESPACE_BEGIN(Grid);
RealD stop = src_norm * Tolerance*Tolerance; RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid(); GridBase* DoublePrecGrid = src_d_in.Grid();
//Generate precision change workspaces
precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
FieldD tmp_d(DoublePrecGrid); FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb; tmp_d.Checkerboard() = cb;
@@ -96,6 +104,7 @@ NAMESPACE_BEGIN(Grid);
FieldF sol_f(SinglePrecGrid); FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb; sol_f.Checkerboard() = cb;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false; CG_f.ErrorOnNoConverge = false;
@@ -119,7 +128,7 @@ NAMESPACE_BEGIN(Grid);
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d); precisionChange(src_f, src_d, wk_sp_from_dp);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
sol_f = Zero(); sol_f = Zero();
@@ -129,6 +138,7 @@ NAMESPACE_BEGIN(Grid);
(*guesser)(src_f, sol_f); (*guesser)(src_f, sol_f);
//Inner CG //Inner CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
CG_f.Tolerance = inner_tol; CG_f.Tolerance = inner_tol;
InnerCGtimer.Start(); InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f); CG_f(Linop_f, src_f, sol_f);
@@ -137,7 +147,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution //Convert sol back to double and add to double prec solution
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f); precisionChange(tmp_d, sol_f, wk_dp_from_sp);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d); axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -149,6 +159,7 @@ NAMESPACE_BEGIN(Grid);
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations); ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d); CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete; TotalFinalStepIterations = CG_d.IterationsToComplete;
TrueResidual = CG_d.TrueResidual;
TotalTimer.Stop(); TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;

View File

@@ -47,14 +47,18 @@ public:
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose; int verbose;
MultiShiftFunction shifts; MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) :
MaxIterations(maxit), MaxIterations(maxit),
shifts(_shifts) shifts(_shifts)
{ {
verbose=1; verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
} }
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
@@ -125,6 +129,17 @@ public:
// Residuals "r" are src // Residuals "r" are src
// First search direction "p" is also src // First search direction "p" is also src
cp = norm2(src); cp = norm2(src);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s]; rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
@@ -168,6 +183,9 @@ public:
axpby(psi[s],0.,-bs[s]*alpha[s],src,src); axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
} }
std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
/////////////////////////////////////// ///////////////////////////////////////
// Timers // Timers
/////////////////////////////////////// ///////////////////////////////////////
@@ -270,6 +288,7 @@ public:
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){ if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz]; RealD css = c * z[s][iz]* z[s][iz];
@@ -299,7 +318,8 @@ public:
axpy(r,-alpha[s],src,tmp); axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r); RealD rn = norm2(r);
RealD cn = norm2(src); RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<< TrueResidualShift[s] <<std::endl;
} }
std::cout << GridLogMessage << "Time Breakdown "<<std::endl; std::cout << GridLogMessage << "Time Breakdown "<<std::endl;

View File

@@ -0,0 +1,411 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//Linop to add shift to input linop, used in cleanup CG
namespace ConjugateGradientMultiShiftMixedPrecSupport{
template<typename Field>
class ShiftedLinop: public LinearOperatorBase<Field>{
public:
LinearOperatorBase<Field> &linop_base;
RealD shift;
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out);
axpy(out, shift, in, out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
};
};
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq
) :
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF r_f(SinglePrecGrid);
FieldF p_f(SinglePrecGrid);
FieldF tmp_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
FieldF src_f(SinglePrecGrid);
precisionChange(src_f, src_d, wk_f_from_d);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
r_f=src_f; //residual maintained in single
p_f=src_f;
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
//MdagM+m[0]
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
axpy(mmp_f,mass[0],p_f,mmp_f);
RealD rn = norm2(p_f);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_f,b,mmp_f,r_f);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
a = c /cp;
//Update double precision search direction by residual
PrecChangeTimer.Start();
precisionChange(r_d, r_f, wk_d_from_f);
PrecChangeTimer.Stop();
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_d[s],a,ps_d[s],r_d);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
}
}
}
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
d=real(innerProduct(p_f,mmp_f));
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_f,mass[0],p_f,mmp_f);
AXPYTimer.Stop();
RealD rn = norm2(p_f);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update double precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
}
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
AXPYTimer.Stop();
c = c_f;
if(k % ReliableUpdateFreq == 0){
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
PrecChangeTimer.Start();
precisionChange(r_f, r_d, wk_f_from_d);
PrecChangeTimer.Stop();
c = c_d;
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -33,16 +33,19 @@ namespace Grid {
template<class Field> template<class Field>
class ZeroGuesser: public LinearFunction<Field> { class ZeroGuesser: public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); }; virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
}; };
template<class Field> template<class Field>
class DoNothingGuesser: public LinearFunction<Field> { class DoNothingGuesser: public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { }; virtual void operator()(const Field &src, Field &guess) { };
}; };
template<class Field> template<class Field>
class SourceGuesser: public LinearFunction<Field> { class SourceGuesser: public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = src; }; virtual void operator()(const Field &src, Field &guess) { guess = src; };
}; };
@@ -54,15 +57,24 @@ class DeflatedGuesser: public LinearFunction<Field> {
private: private:
const std::vector<Field> &evec; const std::vector<Field> &evec;
const std::vector<RealD> &eval; const std::vector<RealD> &eval;
const unsigned int N;
public: public:
using LinearFunction<Field>::operator();
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}; DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
: DeflatedGuesser(_evec, _eval, _evec.size())
{}
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N)
{
assert(evec.size()==eval.size());
assert(N <= evec.size());
}
virtual void operator()(const Field &src,Field &guess) { virtual void operator()(const Field &src,Field &guess) {
guess = Zero(); guess = Zero();
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
const Field& tmp = evec[i]; const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
@@ -79,6 +91,7 @@ private:
const std::vector<RealD> &eval_coarse; const std::vector<RealD> &eval_coarse;
public: public:
using LinearFunction<FineField>::operator();
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace, LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse, const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse) const std::vector<RealD> &_eval_coarse)

View File

@@ -37,139 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].View();
std::vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
thread_region
{
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.View();
thread_for(ss, grid->oSites(),{
vobj B = Zero();
for(int k=k0; k<k1; ++k){
auto basis_k = basis[k].View();
B +=Qt(j,k) * basis_k[ss];
}
result_v[ss] = B;
});
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Implicitly restarted lanczos // Implicitly restarted lanczos
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
@@ -279,7 +146,7 @@ public:
RealD _eresid, // resid in lmdue deficit RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1, int _MinRestart=0, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester), SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@@ -295,7 +162,7 @@ public:
RealD _eresid, // resid in lmdue deficit RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1, int _MinRestart=0, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester), SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@@ -344,7 +211,7 @@ until convergence
GridBase *grid = src.Grid(); GridBase *grid = src.Grid();
assert(grid == evec[0].Grid()); assert(grid == evec[0].Grid());
GridLogIRL.TimingMode(1); // GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl; std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -369,14 +236,17 @@ until convergence
{ {
auto src_n = src; auto src_n = src;
auto tmp = src; auto tmp = src;
std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl;
const int _MAX_ITER_IRL_MEVAPP_ = 50; const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) { for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
normalise(src_n); normalise(src_n);
_HermOp(src_n,tmp); _HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05) if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_; i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na; evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@@ -574,11 +444,11 @@ until convergence
/* Saad PP. 195 /* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do: 2. For k = 1,2,...,m Do:
3. wk:=Avkβkv_{k1} 3. wk:=Avk - b_k v_{k-1}
4. αk:=(wk,vk) // 4. ak:=(wk,vk) //
5. wk:=wkαkvk // wk orthog vk 5. wk:=wk-akvk // wk orthog vk
6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop 6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop
7. vk+1 := wk/βk+1 7. vk+1 := wk/b_k+1
8. EndDo 8. EndDo
*/ */
void step(std::vector<RealD>& lmd, void step(std::vector<RealD>& lmd,
@@ -586,6 +456,7 @@ until convergence
std::vector<Field>& evec, std::vector<Field>& evec,
Field& w,int Nm,int k) Field& w,int Nm,int k)
{ {
std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20; const RealD tiny = 1.0e-20;
assert( k< Nm ); assert( k< Nm );
@@ -597,20 +468,20 @@ until convergence
if(k>0) w -= lme[k-1] * evec[k-1]; if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk) ComplexD zalph = innerProduct(evec_k,w);
RealD alph = real(zalph); RealD alph = real(zalph);
w = w - alph * evec_k;// 5. wk:=wkαkvk w = w - alph * evec_k;
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop RealD beta = normalise(w);
// 7. vk+1 := wk/βk+1
lmd[k] = alph; lmd[k] = alph;
lme[k] = beta; lme[k] = beta;
if (k>0 && k % orth_period == 0) { if ( (k>0) && ( (k % orth_period) == 0 )) {
std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
orthogonalize(w,evec,k); // orthonormalise orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl; std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
} }
if(k < Nm-1) evec[k+1] = w; if(k < Nm-1) evec[k+1] = w;
@@ -618,6 +489,8 @@ until convergence
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl; std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny ) if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl; std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
} }
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,

View File

@@ -44,6 +44,7 @@ public:
int, MinRes); // Must restart int, MinRes); // Must restart
}; };
//This class is the input parameter class for some testing programs
struct LocalCoherenceLanczosParams : Serializable { struct LocalCoherenceLanczosParams : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -67,6 +68,7 @@ public:
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > { class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public: public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector; typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField; typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@@ -97,6 +99,7 @@ public:
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > { class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public: public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector; typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField; typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@@ -153,6 +156,7 @@ public:
_coarse_relax_tol(coarse_relax_tol) _coarse_relax_tol(coarse_relax_tol)
{ }; { };
//evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
CoarseField v(B); CoarseField v(B);
@@ -179,8 +183,16 @@ public:
if( (vv<eresid*eresid) ) conv = 1; if( (vv<eresid*eresid) ) conv = 1;
return conv; return conv;
} }
//This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
//applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
//evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
//As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
//We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
evalMaxApprox = 1.0; //cf above
GridBase *FineGrid = _subspace[0].Grid(); GridBase *FineGrid = _subspace[0].Grid();
int checkerboard = _subspace[0].Checkerboard(); int checkerboard = _subspace[0].Checkerboard();
FineField fB(FineGrid);fB.Checkerboard() =checkerboard; FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -199,13 +211,13 @@ public:
eval = vnum/vden; eval = vnum/vden;
fv -= eval*fB; fv -= eval*fB;
RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0); RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
std::cout.precision(13); std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] " std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")" <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
<<std::endl; <<std::endl;
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
if( (vv<eresid*eresid) ) return 1; if( (vv<eresid*eresid) ) return 1;
return 0; return 0;
} }
@@ -283,6 +295,10 @@ public:
evals_coarse.resize(0); evals_coarse.resize(0);
}; };
//The block inner product is the inner product on the fine grid locally summed over the blocks
//to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
//vectors under the block inner product. This step must be performed after computing the fine grid
//eigenvectors and before computing the coarse grid eigenvectors.
void Orthogonalise(void ) { void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid); CoarseScalar InnerProd(_CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl; std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -326,6 +342,8 @@ public:
} }
} }
//While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
//hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{ {
assert(evals_fine.size() == nbasis); assert(evals_fine.size() == nbasis);
@@ -374,18 +392,23 @@ public:
evals_fine.resize(nbasis); evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid); subspace.resize(nbasis,_FineGrid);
} }
//cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
//cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
//relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid, int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes) RealD MaxIt, RealD betastp, int MinRes)
{ {
Chebyshev<FineField> Cheby(cheby_op); Chebyshev<FineField> Cheby(cheby_op); //Chebyshev of fine operator on fine grid
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace); ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth); Chebyshev<FineField> ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
evals_coarse.resize(Nm); evals_coarse.resize(Nm);
@@ -393,6 +416,7 @@ public:
CoarseField src(_CoarseGrid); src=1.0; CoarseField src(_CoarseGrid); src=1.0;
//Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0; int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -403,6 +427,14 @@ public:
std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl; std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl;
} }
} }
//Get the fine eigenvector 'i' by reconstruction
void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
blockPromote(evec_coarse[i],evec,subspace);
eval = evals_coarse[i];
}
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -33,25 +33,77 @@ NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver // Take a matrix and form an NE solver calling a Herm solver
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public OperatorFunction<Field>{ template<class Field> class NormalEquations {
private: private:
SparseMatrixBase<Field> & _Matrix; SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver; OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public: public:
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver) NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
: _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){ void operator() (const Field &in, Field &out){
Field src(in.Grid()); Field src(in.Grid());
Field tmp(in.Grid());
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Matrix.Mdag(in,src); _Matrix.Mdag(in,src);
_HermitianSolver(src,out); // Mdag M out = Mdag in _Guess(src,out);
_HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in
}
};
template<class Field> class HPDSolver {
private:
LinearOperatorBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
HPDSolver(LinearOperatorBase<Field> &Matrix,
OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
_Guess(in,out);
_HermitianSolver(_Matrix,in,out); // Mdag M out = Mdag in
}
};
template<class Field> class MdagMSolver {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Guess(in,out);
_HermitianSolver(MdagMOp,in,out); // Mdag M out = Mdag in
} }
}; };

View File

@@ -30,12 +30,14 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) { std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na; evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox; return evalMaxApprox;
} }
evalMaxApprox = na; evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp; src_n = tmp;
} }
assert(0); assert(0);

View File

@@ -38,34 +38,41 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Field> #define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
int verbose; int verbose;
int mmax; int mmax;
int nstep; int nstep;
int steps; int steps;
int level;
GridStopWatch PrecTimer; GridStopWatch PrecTimer;
GridStopWatch MatTimer; GridStopWatch MatTimer;
GridStopWatch LinalgTimer; GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner; LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol), Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec), Preconditioner(Prec),
mmax(_mmax), mmax(_mmax),
nstep(_nstep) nstep(_nstep)
{ {
level=1;
verbose=1; verbose=1;
}; };
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi=Zero(); psi=Zero();
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
@@ -84,9 +91,9 @@ public:
steps=0; steps=0;
for(int k=0;k<MaxIterations;k++){ for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(Linop,src,psi,rsq); cp=GCRnStep(src,psi,rsq);
std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl; GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) { if(cp<rsq) {
@@ -95,24 +102,26 @@ public:
Linop.HermOp(psi,r); Linop.HermOp(psi,r);
axpy(r,-1.0,src,r); axpy(r,-1.0,src,r);
RealD tr = norm2(r); RealD tr = norm2(r);
std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq) << " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq) << " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl; << " target " <<Tolerance <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl; GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl; /*
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl; GCRLogLevel<<"PGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl; GCRLogLevel<<"PGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
*/
return; return;
} }
} }
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl; GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0); // assert(0);
} }
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp; RealD cp;
RealD a, b; RealD a, b;
@@ -134,6 +143,7 @@ public:
std::vector<Field> p(mmax,grid); std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax); std::vector<RealD> qq(mmax);
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
////////////////////////////////// //////////////////////////////////
// initial guess x0 is taken as nonzero. // initial guess x0 is taken as nonzero.
@@ -143,38 +153,26 @@ public:
Linop.HermOpAndNorm(psi,Az,zAz,zAAz); Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop(); MatTimer.Stop();
LinalgTimer.Start(); LinalgTimer.Start();
r=src-Az; r=src-Az;
LinalgTimer.Stop(); LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
///////////////////// /////////////////////
// p = Prec(r) // p = Prec(r)
///////////////////// /////////////////////
PrecTimer.Start(); PrecTimer.Start();
Preconditioner(r,z); Preconditioner(r,z);
PrecTimer.Stop(); PrecTimer.Stop();
MatTimer.Start();
Linop.HermOp(z,tmp);
MatTimer.Stop();
LinalgTimer.Start();
ttmp=tmp;
tmp=tmp-r;
LinalgTimer.Stop();
/*
std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl;
std::cout<<GridLogMessage<<ttmp<<std::endl;
std::cout<<GridLogMessage<<tmp<<std::endl;
*/
MatTimer.Start(); MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz); Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop(); MatTimer.Stop();
LinalgTimer.Start(); LinalgTimer.Start();
//p[0],q[0],qq[0] //p[0],q[0],qq[0]
p[0]= z; p[0]= z;
q[0]= Az; q[0]= Az;
@@ -200,11 +198,12 @@ public:
cp = axpy_norm(r,-a,q[peri_k],r); cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop(); LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){ if((k==nstep-1)||(cp<rsq)){
return cp; return cp;
} }
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start(); PrecTimer.Start();
Preconditioner(r,z);// solve Az = r Preconditioner(r,z);// solve Az = r
@@ -212,12 +211,9 @@ public:
MatTimer.Start(); MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz); Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop(); MatTimer.Stop();
LinalgTimer.Start(); LinalgTimer.Start();
tmp=tmp-r;
std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[peri_kp]=Az; q[peri_kp]=Az;
p[peri_kp]=z; p[peri_kp]=z;

View File

@@ -0,0 +1,242 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_NON_HERM_H
#define GRID_PREC_GCR_NON_HERM_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
void operator() (const Field &src, Field &psi){
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src.Grid());
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(src,psi,rsq);
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.Op(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
return;
}
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b;
// ComplexD zAz;
RealD zAAz;
ComplexD rq;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
LinalgTimer.Start();
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= innerProduct(q[peri_k],r); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
// zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,371 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithmsf/iterative/QuasiMinimalResidual.h
Copyright (C) 2019
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Field>
RealD innerG5ProductReal(Field &l, Field &r)
{
Gamma G5(Gamma::Algebra::Gamma5);
Field tmp(l.Grid());
// tmp = G5*r;
G5R5(tmp,r);
ComplexD ip =innerProduct(l,tmp);
std::cout << "innerProductRealG5R5 "<<ip<<std::endl;
return ip.real();
}
template<class Field>
class QuasiMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge;
RealD Tolerance;
Integer MaxIterations;
Integer IterationCount;
QuasiMinimalResidual(RealD tol,
Integer maxit,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, ErrorOnNoConverge(err_on_no_conv)
{};
#if 1
void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)
{
RealD resid;
IterationCount=0;
RealD rho, rho_1, xi, gamma, gamma_1, theta, theta_1;
RealD eta, delta, ep, beta;
GridBase *Grid = b.Grid();
Field r(Grid), d(Grid), s(Grid);
Field v(Grid), w(Grid), y(Grid), z(Grid);
Field v_tld(Grid), w_tld(Grid), y_tld(Grid), z_tld(Grid);
Field p(Grid), q(Grid), p_tld(Grid);
Real normb = norm2(b);
LinOp.Op(x,r); r = b - r;
assert(normb> 0.0);
resid = norm2(r)/normb;
if (resid <= Tolerance) {
return;
}
v_tld = r;
y = v_tld;
rho = norm2(y);
// Take Gamma5 conjugate
// Gamma G5(Gamma::Algebra::Gamma5);
// G5R5(w_tld,r);
// w_tld = G5* v_tld;
w_tld=v_tld;
z = w_tld;
xi = norm2(z);
gamma = 1.0;
eta = -1.0;
theta = 0.0;
for (int i = 1; i <= MaxIterations; i++) {
// Breakdown tests
assert( rho != 0.0);
assert( xi != 0.0);
v = (1. / rho) * v_tld;
y = (1. / rho) * y;
w = (1. / xi) * w_tld;
z = (1. / xi) * z;
ComplexD Zdelta = innerProduct(z, y); // Complex?
std::cout << "Zdelta "<<Zdelta<<std::endl;
delta = Zdelta.real();
y_tld = y;
z_tld = z;
if (i > 1) {
p = y_tld - (xi * delta / ep) * p;
q = z_tld - (rho * delta / ep) * q;
} else {
p = y_tld;
q = z_tld;
}
LinOp.Op(p,p_tld); // p_tld = A * p;
ComplexD Zep = innerProduct(q, p_tld);
ep=Zep.real();
std::cout << "Zep "<<Zep <<std::endl;
// Complex Audit
assert(abs(ep)>0);
beta = ep / delta;
assert(abs(beta)>0);
v_tld = p_tld - beta * v;
y = v_tld;
rho_1 = rho;
rho = norm2(y);
LinOp.AdjOp(q,w_tld);
w_tld = w_tld - beta * w;
z = w_tld;
xi = norm2(z);
gamma_1 = gamma;
theta_1 = theta;
theta = rho / (gamma_1 * beta);
gamma = 1.0 / sqrt(1.0 + theta * theta);
std::cout << "theta "<<theta<<std::endl;
std::cout << "gamma "<<gamma<<std::endl;
assert(abs(gamma)> 0.0);
eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
if (i > 1) {
d = eta * p + (theta_1 * theta_1 * gamma * gamma) * d;
s = eta * p_tld + (theta_1 * theta_1 * gamma * gamma) * s;
} else {
d = eta * p;
s = eta * p_tld;
}
x =x+d; // update approximation vector
r =r-s; // compute residual
if ((resid = norm2(r) / normb) <= Tolerance) {
return;
}
std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
}
assert(0);
return; // no convergence
}
#else
// QMRg5 SMP thesis
void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)
{
// Real scalars
GridBase *grid = b.Grid();
Field r(grid);
Field p_m(grid), p_m_minus_1(grid), p_m_minus_2(grid);
Field v_m(grid), v_m_minus_1(grid), v_m_plus_1(grid);
Field tmp(grid);
RealD w;
RealD z1, z2;
RealD delta_m, delta_m_minus_1;
RealD c_m_plus_1, c_m, c_m_minus_1;
RealD s_m_plus_1, s_m, s_m_minus_1;
RealD alpha, beta, gamma, epsilon;
RealD mu, nu, rho, theta, xi, chi;
RealD mod2r, mod2b;
RealD tau2, target2;
mod2b=norm2(b);
/////////////////////////
// Initial residual
/////////////////////////
LinOp.Op(x,tmp);
r = b - tmp;
/////////////////////////
// \mu = \rho = |r_0|
/////////////////////////
mod2r = norm2(r);
rho = sqrt( mod2r);
mu=rho;
std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
/////////////////////////
// Zero negative history
/////////////////////////
v_m_plus_1 = Zero();
v_m_minus_1 = Zero();
p_m_minus_1 = Zero();
p_m_minus_2 = Zero();
// v0
v_m = (1.0/rho)*r;
/////////////////////////
// Initial coeffs
/////////////////////////
delta_m_minus_1 = 1.0;
c_m_minus_1 = 1.0;
c_m = 1.0;
s_m_minus_1 = 0.0;
s_m = 0.0;
/////////////////////////
// Set up convergence check
/////////////////////////
tau2 = mod2r;
target2 = mod2b * Tolerance*Tolerance;
for(int iter = 0 ; iter < MaxIterations; iter++){
/////////////////////////
// \delta_m = (v_m, \gamma_5 v_m)
/////////////////////////
delta_m = innerG5ProductReal(v_m,v_m);
std::cout << "QuasiMinimalResidual delta_m "<< delta_m<<std::endl;
/////////////////////////
// tmp = A v_m
/////////////////////////
LinOp.Op(v_m,tmp);
/////////////////////////
// \alpha = (v_m, \gamma_5 temp) / \delta_m
/////////////////////////
alpha = innerG5ProductReal(v_m,tmp);
alpha = alpha/delta_m ;
std::cout << "QuasiMinimalResidual alpha "<< alpha<<std::endl;
/////////////////////////
// \beta = \rho \delta_m / \delta_{m-1}
/////////////////////////
beta = rho * delta_m / delta_m_minus_1;
std::cout << "QuasiMinimalResidual beta "<< beta<<std::endl;
/////////////////////////
// \tilde{v}_{m+1} = temp - \alpha v_m - \beta v_{m-1}
/////////////////////////
v_m_plus_1 = tmp - alpha*v_m - beta*v_m_minus_1;
///////////////////////////////
// \rho = || \tilde{v}_{m+1} ||
///////////////////////////////
rho = sqrt( norm2(v_m_plus_1) );
std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
///////////////////////////////
// v_{m+1} = \tilde{v}_{m+1}
///////////////////////////////
v_m_plus_1 = (1.0 / rho) * v_m_plus_1;
////////////////////////////////
// QMR recurrence coefficients.
////////////////////////////////
theta = s_m_minus_1 * beta;
gamma = c_m_minus_1 * beta;
epsilon = c_m * gamma + s_m * alpha;
xi = -s_m * gamma + c_m * alpha;
nu = sqrt( xi*xi + rho*rho );
c_m_plus_1 = fabs(xi) / nu;
if ( xi == 0.0 ) {
s_m_plus_1 = 1.0;
} else {
s_m_plus_1 = c_m_plus_1 * rho / xi;
}
chi = c_m_plus_1 * xi + s_m_plus_1 * rho;
std::cout << "QuasiMinimalResidual coeffs "<< theta <<" "<<gamma<<" "<< epsilon<<" "<< xi<<" "<< nu<<std::endl;
std::cout << "QuasiMinimalResidual coeffs "<< chi <<std::endl;
////////////////////////////////
//p_m=(v_m - \epsilon p_{m-1} - \theta p_{m-2}) / \chi
////////////////////////////////
p_m = (1.0/chi) * v_m - (epsilon/chi) * p_m_minus_1 - (theta/chi) * p_m_minus_2;
////////////////////////////////////////////////////////////////
// \psi = \psi + c_{m+1} \mu p_m
////////////////////////////////////////////////////////////////
x = x + ( c_m_plus_1 * mu ) * p_m;
////////////////////////////////////////
//
////////////////////////////////////////
mu = -s_m_plus_1 * mu;
delta_m_minus_1 = delta_m;
c_m_minus_1 = c_m;
c_m = c_m_plus_1;
s_m_minus_1 = s_m;
s_m = s_m_plus_1;
////////////////////////////////////
// Could use pointer swizzle games.
////////////////////////////////////
v_m_minus_1 = v_m;
v_m = v_m_plus_1;
p_m_minus_2 = p_m_minus_1;
p_m_minus_1 = p_m;
/////////////////////////////////////
// Convergence checks
/////////////////////////////////////
z1 = RealD(iter+1.0);
z2 = z1 + 1.0;
tau2 = tau2 *( z2 / z1 ) * s_m * s_m;
std::cout << " QuasiMinimumResidual iteration "<< iter<<std::endl;
std::cout << " QuasiMinimumResidual tau bound "<< tau2<<std::endl;
// Compute true residual
mod2r = tau2;
if ( 1 || (tau2 < (100.0 * target2)) ) {
LinOp.Op(x,tmp);
r = b - tmp;
mod2r = norm2(r);
std::cout << " QuasiMinimumResidual true residual is "<< mod2r<<std::endl;
}
if ( mod2r < target2 ) {
std::cout << " QuasiMinimumResidual has converged"<<std::endl;
return;
}
}
}
#endif
};
NAMESPACE_END(Grid);

View File

@@ -132,6 +132,31 @@ namespace Grid {
(*this)(_Matrix,in,out,guess); (*this)(_Matrix,in,out,guess);
} }
void RedBlackSource(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
}
// James can write his own deflated guesser
// with optimised code for the inner products
// RedBlackSolveSplitGrid();
// RedBlackSolve(_Matrix,src_o,sol_o);
void RedBlackSolution(Matrix &_Matrix, const std::vector<Field> &in, const std::vector<Field> &sol_o, std::vector<Field> &out)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++) {
pickCheckerboard(Even,tmp,in[b]);
RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
}
}
template<class Guesser> template<class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
{ {
@@ -150,22 +175,27 @@ namespace Grid {
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Prepare RedBlack source // Prepare RedBlack source
//////////////////////////////////////////////// ////////////////////////////////////////////////
for(int b=0;b<nblock;b++){ RedBlackSource(_Matrix,in,src_o);
RedBlackSource(_Matrix,in[b],tmp,src_o[b]); // for(int b=0;b<nblock;b++){
} // RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
// }
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Make the guesses // Make the guesses
//////////////////////////////////////////////// ////////////////////////////////////////////////
if ( subGuess ) guess_save.resize(nblock,grid); if ( subGuess ) guess_save.resize(nblock,grid);
for(int b=0;b<nblock;b++){
if(useSolnAsInitGuess) { if(useSolnAsInitGuess) {
for(int b=0;b<nblock;b++){
pickCheckerboard(Odd, sol_o[b], out[b]); pickCheckerboard(Odd, sol_o[b], out[b]);
}
} else { } else {
guess(src_o[b],sol_o[b]); guess(src_o, sol_o);
} }
if ( subGuess ) { if ( subGuess ) {
for(int b=0;b<nblock;b++){
guess_save[b] = sol_o[b]; guess_save[b] = sol_o[b];
} }
} }
@@ -405,6 +435,70 @@ namespace Grid {
} }
}; };
template<class Field> class NonHermitianSchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field>
{
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
NonHermitianSchurRedBlackDiagMooeeSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd , src_o, src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
}
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e_i(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even );
src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even );
_Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd );
}
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd);
}
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
{
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv // Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta // ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
@@ -482,5 +576,76 @@ namespace Grid {
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
} }
}; };
template<class Field> class NonHermitianSchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field>
{
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
NonHermitianSchurRedBlackDiagTwoSolve(OperatorFunction<Field>& RBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(RBSolver, initSubGuess, _solnAsInitGuess) {};
virtual void RedBlackSource(Matrix& _Matrix, const Field& src, Field& src_e, Field& src_o)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even, src_e, src);
pickCheckerboard(Odd , src_o, src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
}
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
{
GridBase* grid = _Matrix.RedBlackGrid();
GridBase* fgrid = _Matrix.Grid();
Field sol_o_i(grid);
Field tmp(grid);
Field sol_e(grid);
////////////////////////////////////////////////
// MooeeInv due to pecond
////////////////////////////////////////////////
_Matrix.MooeeInv(sol_o, tmp);
sol_o_i = tmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even );
tmp = src_e - tmp; assert( src_e.Checkerboard() == Even );
_Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd );
};
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{
NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
};
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
{
NonHermitianSchurDiagTwoOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
}
};
} }
#endif #endif

View File

@@ -26,114 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H #pragma once
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#ifdef POINTER_CACHE
class PointerCache {
private:
static const int Ncache=8;
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
#endif
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp> template<typename _Tp>
class alignedAllocator { class alignedAllocator {
public: public:
@@ -157,88 +53,131 @@ public:
{ {
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr; return ptr;
} }
void deallocate(pointer __p, size_type __n) { void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes); profilerFree(bytes);
MemoryManager::CpuFree((void *)__p,bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
#endif
} }
// FIXME: hack for the copy constructor, eventually it must be avoided // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); }; void construct(pointer __p, const _Tp& __val) { assert(0);};
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////
// Unified virtual memory
//////////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class uvmAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
uvmAllocator() throw() { }
uvmAllocator(const uvmAllocator&) throw() { }
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
~uvmAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::SharedFree((void *)__p,bytes);
}
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Device memory
////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class devAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
devAllocator() throw() { }
devAllocator(const devAllocator&) throw() { }
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
~devAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::AcceleratorFree((void *)__p,bytes);
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>; #ifdef ACCELERATOR_CSHIFT
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; // Cshift on device
template<class T> using commVector = std::vector<T,alignedAllocator<T> >; template<class T> using cshiftAllocator = devAllocator<T>;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; #else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/MemoryManager.h>
#include <Grid/allocator/AlignedAllocator.h>

View File

@@ -0,0 +1,300 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : PrintBytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
uint64_t cacheBytes;
cacheBytes = CacheBytes[Cpu];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Acc];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Shared];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
#ifdef GRID_CUDA
cuda_mem();
#endif
}
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
total_device+=bytes;
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
total_device-=bytes;
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorFree "<<std::endl;
PrintBytes();
#endif
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
total_shared+=bytes;
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
total_shared-=bytes;
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedFree "<<std::endl;
PrintBytes();
#endif
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#endif
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void MemoryManager::Init(void)
{
char * str;
int Nc;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
Ncache[Shared]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc;
}
}
}
void MemoryManager::InitMessage(void) {
#ifndef GRID_UVM
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
#endif
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif
#ifdef GRID_UVM
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
cacheBytes -= entries[v].bytes;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
cacheBytes += bytes;
return ret;
}
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
cacheBytes -= entries[e].bytes;
return entries[e].address;
}
}
return NULL;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,182 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryManager.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <list>
#include <unordered_map>
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum ViewAdvise {
AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
// AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
// AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
class MemoryManager {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
static uint64_t CacheBytes[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
static void PrintBytes(void);
public:
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void *SharedAllocate(size_t bytes);
static void SharedFree (void *ptr,size_t bytes);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
static uint64_t DeviceBytes;
static uint64_t DeviceLRUBytes;
static uint64_t DeviceMaxBytes;
static uint64_t HostToDeviceBytes;
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
typedef std::list<uint64_t> LRU_t;
typedef typename LRU_t::iterator LRUiterator;
typedef struct {
int LRU_valid;
LRUiterator LRU_entry;
uint64_t CpuPtr;
uint64_t AccPtr;
size_t bytes;
uint32_t transient;
uint32_t state;
uint32_t accLock;
uint32_t cpuLock;
} AcceleratorViewEntry;
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
static AccViewTable_t AccViewTable;
static LRU_t LRU;
/////////////////////////////////////////////////
// Device motion
/////////////////////////////////////////////////
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
static void Evict(AcceleratorViewEntry &AccCache);
static void Flush(AcceleratorViewEntry &AccCache);
static void Clone(AcceleratorViewEntry &AccCache);
static void AccDiscard(AcceleratorViewEntry &AccCache);
static void CpuDiscard(AcceleratorViewEntry &AccCache);
// static void LRUupdate(AcceleratorViewEntry &AccCache);
static void LRUinsert(AcceleratorViewEntry &AccCache);
static void LRUremove(AcceleratorViewEntry &AccCache);
// manage entries in the table
static int EntryPresent(uint64_t CpuPtr);
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EntryErase (uint64_t CpuPtr);
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
static void AcceleratorViewClose(uint64_t AccPtr);
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
static void NotifyDeletion(void * CpuPtr);
public:
static void Print(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,505 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
MemoryManager::LRU_t MemoryManager::LRU;
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
/////////////////////////////////////////////////
// Mechanics of data table maintenance
/////////////////////////////////////////////////
int MemoryManager::EntryPresent(uint64_t CpuPtr)
{
if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
return count;
}
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
assert(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty;
AccCache.LRU_valid=0;
AccCache.transient=0;
AccCache.accLock=0;
AccCache.cpuLock=0;
AccViewTable[CpuPtr] = AccCache;
}
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
assert(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr);
assert(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator;
}
void MemoryManager::EntryErase(uint64_t CpuPtr)
{
auto AccCache = EntryLookup(CpuPtr);
AccViewTable.erase(CpuPtr);
}
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==0);
if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end();
} else {
LRU.push_front(AccCache.CpuPtr);
AccCache.LRU_entry = LRU.begin();
}
AccCache.LRU_valid = 1;
DeviceLRUBytes+=AccCache.bytes;
}
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes;
}
/////////////////////////////////////////////////
// Accelerator cache motion & consistency logic
/////////////////////////////////////////////////
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////
// Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool.
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==AccDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
}
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==CpuDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
AccCache.state=Consistent;
}
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state!=Empty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
AccCache.state=AccDirty;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
} else {
assert(0);
}
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
return NULL;
}
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
}
}
}
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,hint);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Cpu starts primary
if(mode==AcceleratorWriteDiscard){
CpuDiscard(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite){
Clone(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
assert(0);
}
// If view is opened on device remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
LRUremove(AccCache);
}
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
return AccCache.AccPtr;
}
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock==0);
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
LRUinsert(AccCache);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock>0);
assert(AccCache.accLock==0);
AccCache.cpuLock--;
}
/*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,transient);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes);
}
if(AccCache.state==Empty) {
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache.accLock= 0;
AccCache.cpuLock= 1;
} else if(AccCache.state==CpuDirty ){
// AccPtr dont care, deferred allocate
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++;
} else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++;
} else {
assert(0); // should be unreachable
}
AccCache.transient= transient? EvictNext : 0;
return AccCache.CpuPtr;
}
void MemoryManager::NotifyDeletion(void *_ptr)
{
// Look up in ViewCache
uint64_t ptr = (uint64_t)_ptr;
if(EntryPresent(ptr)) {
auto e = EntryLookup(ptr);
AccDiscard(e->second);
}
}
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
return AccCache.cpuLock+AccCache.accLock;
} else {
return 0;
}
}
void MemoryManager::PrintState(void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
} else {
std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,27 @@
#include <Grid/GridCore.h>
#ifdef GRID_UVM
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::PrintState(void* CpuPtr)
{
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);
#endif

View File

@@ -6,66 +6,6 @@ NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr; MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false; bool MemoryProfiler::debug = false;
#ifdef POINTER_CACHE
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<Ncache;e++) {
if ( Entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%Ncache;
}
if ( Entries[v].valid ) {
ret = Entries[v].address;
Entries[v].valid = 0;
Entries[v].address = NULL;
Entries[v].bytes = 0;
}
Entries[v].address=ptr;
Entries[v].bytes =bytes;
Entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<Ncache;e++){
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
Entries[e].valid = 0;
return Entries[e].address;
}
}
return NULL;
}
#endif
void check_huge_pages(void *Buf,uint64_t BYTES) void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
#ifdef __linux__ #ifdef __linux__

View File

@@ -0,0 +1,95 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);

View File

@@ -47,20 +47,19 @@ public:
// Give Lattice access // Give Lattice access
template<class object> friend class Lattice; template<class object> friend class Lattice;
GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {}; GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;};
GridBase(const Coordinate & processor_grid, GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent, const CartesianCommunicator &parent,
int &split_rank) int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {}; : CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;};
GridBase(const Coordinate & processor_grid, GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent) const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {}; : CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;};
virtual ~GridBase() = default; virtual ~GridBase() = default;
// Physics Grid information. // Physics Grid information.
Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes. Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
@@ -81,6 +80,8 @@ public:
Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic;
Coordinate _checker_dim_mask;
public: public:

View File

@@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
@@ -104,6 +105,7 @@ public:
_ldimensions.resize(_ndimension); _ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
@@ -114,6 +116,8 @@ public:
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
_checker_dim_mask[d]=0;
_fdimensions[d] = dimensions[d]; // Global dimensions _fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d]; _simd_layout[d] = simd_layout[d];

View File

@@ -36,11 +36,27 @@ static const int CbBlack=1;
static const int Even =CbRed; static const int Even =CbRed;
static const int Odd =CbBlack; static const int Odd =CbBlack;
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk)
{
int nd=rdim.size();
Coordinate coor(nd);
Lexicographic::CoorFromIndex(coor,oindex,rdim);
int linear=0;
for(int d=0;d<nd;d++){
if(chk_dim_msk[d])
linear=linear+coor[d];
}
return (linear&0x1);
}
// Specialise this for red black grids storing half the data like a chess board. // Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase class GridRedBlackCartesian : public GridBase
{ {
public: public:
Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;

View File

@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
bool Stencil_force_mpi = true;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@@ -36,6 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
class CartesianCommunicator : public SharedMemory { class CartesianCommunicator : public SharedMemory {
public: public:
@@ -108,12 +109,15 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Reduction // Reduction
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void GlobalMax(RealD &);
void GlobalMax(RealF &);
void GlobalSum(RealF &); void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N); void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &); void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N); void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &); void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &); void GlobalSum(uint64_t &);
void GlobalSumVector(uint64_t*,int N);
void GlobalSum(ComplexF &c); void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N); void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c); void GlobalSum(ComplexD &c);
@@ -137,21 +141,6 @@ public:
int recv_from_rank, int recv_from_rank,
int bytes); int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit, double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,

View File

@@ -43,8 +43,16 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
#ifndef GRID_COMMS_THREADS
nCommThreads=1;
// wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
// other comms schemes are ok
MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
#else
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
#endif
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
assert(0); assert(0);
@@ -255,6 +263,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){ void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0); assert(ierr==0);
@@ -263,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalMax(float &f)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){ void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
@@ -290,60 +312,28 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes) int bytes)
{ {
std::vector<CommsRequest_t> reqs(0); std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0); unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0); unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { // Enforce no UVM in comms, device or host OK
MPI_Request xrq; assert(acceleratorIsCommunicable(xmit));
MPI_Request rrq; assert(acceleratorIsCommunicable(recv));
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally // Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from, recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
assert(ierr==0); assert(ierr==0);
}
}
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dest,
void *recv, void *recv,
@@ -378,19 +368,28 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor); assert(from != _processor);
assert(gme == ShmRank); assert(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag;
if ( gfrom ==MPI_UNDEFINED) { if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq); tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0); assert(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} }
if ( gdest == MPI_UNDEFINED ) { if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq); tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} else {
// TODO : make a OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
} }
if ( CommunicatorPolicy == CommunicatorPolicySequential ) { if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@@ -399,16 +398,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
return off_node_bytes; return off_node_bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{ {
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
int nreq=list.size(); int nreq=list.size();
if (nreq==0) return; if (nreq==0) return;
@@ -418,6 +412,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(ierr==0); assert(ierr==0);
list.resize(0); list.resize(0);
} }
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
//{
//}
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void)
{ {
int ierr = MPI_Barrier(communicator); int ierr = MPI_Barrier(communicator);
@@ -479,5 +480,3 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -67,24 +67,18 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
CartesianCommunicator::~CartesianCommunicator(){} CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalMax(float &){}
void CartesianCommunicator::GlobalMax(double &){}
void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){} void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::GlobalSum(uint32_t &){} void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSum(uint64_t &){} void CartesianCommunicator::GlobalSum(uint64_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){} void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node // Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit, void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -95,20 +89,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{ {
assert(0); assert(0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
bcopy(in,out,bytes*words); bcopy(in,out,bytes*words);
@@ -136,10 +116,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int recv_from_rank, int recv_from_rank,
int bytes, int dir) int bytes, int dir)
{ {
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes; return 2.0*bytes;
} }
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -149,13 +125,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank, int recv_from_rank,
int bytes, int dir) int bytes, int dir)
{ {
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ {
SendToRecvFromComplete(waitall);
} }
void CartesianCommunicator::StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};

View File

@@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
if (heap_bytes >= heap_size) { if (heap_bytes >= heap_size) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl; std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl; std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(heap_bytes<heap_size); assert(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl; //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;

View File

@@ -41,9 +41,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <sys/shm.h> #include <sys/shm.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <zlib.h> #include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@@ -99,12 +96,13 @@ public:
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// Provide shared memory facilities off comm world // Provide shared memory facilities off comm world
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes); static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };

View File

@@ -7,6 +7,7 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -29,8 +30,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
#ifdef GRID_NVCC #ifdef GRID_CUDA
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCl
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@@ -47,7 +54,12 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// Split into groups that can share memory // Split into groups that can share memory
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
#else
MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
#endif
MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize); MPI_Comm_size(WorldShmComm ,&WorldShmSize);
@@ -61,6 +73,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNodes = WorldSize/WorldShmSize; WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize ); assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ? // FIXME: Check all WorldShmSize are the same ?
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@@ -155,6 +168,59 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm); if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm); else OptimalCommunicatorSharedMemory(processors,optimal_comm);
} }
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -221,17 +287,13 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// in a maximally symmetrical way // in a maximally symmetrical way
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
std::vector<int> processor_coor(ndimension); Coordinate processor_coor(ndimension);
std::vector<int> WorldDims = processors.toVector(); Coordinate WorldDims = processors;
std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension); Coordinate ShmDims (ndimension); Coordinate NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension); Coordinate ShmCoor (ndimension); Coordinate NodeCoor (ndimension); Coordinate WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension); Coordinate HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){ GetShmDims(WorldDims,ShmDims);
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
@@ -281,27 +343,16 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims // Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way // in a maximally symmetrical way
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
Coordinate processor_coor(ndimension); Coordinate processor_coor(ndimension);
Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1); Coordinate NodeDims (ndimension); Coordinate WorldDims = processors; Coordinate ShmDims(ndimension); Coordinate NodeDims (ndimension);
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension); Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
GetShmDims(WorldDims,ShmDims);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -399,7 +450,47 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC #if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
//if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = ShmCommBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@@ -418,41 +509,74 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// e.g. DGX1, supermicro board, // e.g. DGX1, supermicro board,
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2); // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
cudaSetDevice(WorldShmRank);
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
auto err = cudaMalloc(&ShmCommBuf, bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl; std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
if ( WorldRank == 0 ){ if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
} }
SharedMemoryZero(ShmCommBuf,bytes); SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node // Loop over ranks/gpu's on our node
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
for(int r=0;r<WorldShmSize;r++){ for(int r=0;r<WorldShmSize;r++){
#ifndef GRID_MPI3_SHM_NONE
////////////////////////////////////////////////// //////////////////////////////////////////////////
// If it is me, pass around the IPC access key // If it is me, pass around the IPC access key
////////////////////////////////////////////////// //////////////////////////////////////////////////
cudaIpcMemHandle_t handle; void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) { if ( r==WorldShmRank ) {
err = cudaIpcGetMemHandle(&handle,ShmCommBuf); auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
}
#endif
#ifdef GRID_CUDA
cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) { if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif
#ifdef GRID_HIP
hipIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
////////////////////////////////////////////////// //////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm // Share this IPC handle across the Shm Comm
////////////////////////////////////////////////// //////////////////////////////////////////////////
@@ -468,23 +592,68 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer // If I am not the source, overwrite thisBuf with remote buffer
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) { if ( r!=WorldShmRank ) {
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess); thisBuf = nullptr;
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
}
assert(thisBuf!=nullptr);
}
#endif
#ifdef GRID_CUDA
if ( r!=WorldShmRank ) {
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) { if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif
#ifdef GRID_HIP
if ( r!=WorldShmRank ) {
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Save a copy of the device buffers // Save a copy of the device buffers
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
}
WorldShmCommBufs[r] = thisBuf; WorldShmCommBufs[r] = thisBuf;
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
} }
_ShmAllocBytes=bytes; _ShmAllocBytes=bytes;
_ShmAlloc=1; _ShmAlloc=1;
} }
#endif
#else #else
#ifdef GRID_MPI3_SHMMMAP #ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
@@ -612,7 +781,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif #endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) { if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap"); perror("failed mmap");
assert(0); assert(0);
@@ -656,16 +824,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{ {
#ifdef GRID_NVCC #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
cudaMemset(dest,0,bytes); acceleratorMemSet(dest,0,bytes);
#else #else
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes) void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{ {
#ifdef GRID_NVCC #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault); acceleratorCopyToDevice(src,dest,bytes);
#else #else
bcopy(src,dest,bytes); bcopy(src,dest,bytes);
#endif #endif
@@ -684,7 +852,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// Split into groups that can share memory // Split into groups that can share memory
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
#else
MPI_Comm_split(comm, rank, 0, &ShmComm);
#endif
MPI_Comm_rank(ShmComm ,&ShmRank); MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize); MPI_Comm_size(ShmComm ,&ShmSize);
ShmCommBufs.resize(ShmSize); ShmCommBufs.resize(ShmSize);
@@ -714,7 +886,18 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r; std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
SharedMemoryTest(); #ifdef GRID_SHM_FORCE_MPI
// Hide the shared memory path between ranks
{
for(int r=0;r<size;r++){
if ( r!=rank ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
}
#endif
//SharedMemoryTest();
} }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// On node barrier // On node barrier

View File

@@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryNone: "
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap // Hugetlbfs mapping intended, use anonymous mmap
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#if 1
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@@ -83,7 +116,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes; _ShmAllocBytes=bytes;
_ShmAlloc=1; _ShmAlloc=1;
}; };
#endif
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality

View File

@@ -49,4 +49,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif #endif
NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{
return Cshift(closure(expr),dim,shift);
}
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -29,11 +29,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern Vector<std::pair<int,int> > Cshift_table;
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0; int ent = 0;
static Vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int bo = n*e2; int bo = n*e2;
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b); Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
} }
} }
} else { } else {
@@ -65,14 +67,26 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int o = n*stride; int o = n*stride;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b); Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
} }
} }
} }
} }
thread_for(i,ent,{ {
buffer[table[i].first]=rhs_v[table[i].second]; auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@@ -95,43 +109,80 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){ if ( cbmask ==0x3){
thread_for_collapse(2,n,e1,{ #ifdef ACCELERATOR_CSHIFT
for(int b=0;b<e2;b++){ autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
int o = n*n1; int o = n*n1;
int offset = b+n*e2; int offset = b+n*e2;
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}
}); });
} else { #else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
// Case of SIMD split AND checker dim cannot currently be hit, except in vobj temp =rhs_v[so+o+b];
// Test_cshift_red_black code. extract<vobj>(temp,pointers,offset);
std::cout << " Dense packed buffer WARNING " <<std::endl; });
thread_for_collapse(2,n,e1,{ #endif
for(int b=0;b<e2;b++){ } else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
Coordinate coor;
int o=n*n1; int o=n*n1;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2; int offset = b+n*e2;
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
} }
}); });
#endif
} }
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -145,7 +196,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent =0; int ent =0;
if ( cbmask ==0x3 ) { if ( cbmask ==0x3 ) {
@@ -154,7 +206,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs.Grid()->_slice_block[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b); Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
} }
} }
@@ -165,16 +217,27 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++); Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
} }
} }
} }
} }
auto rhs_v = rhs.View(); {
thread_for(i,ent,{ auto buffer_p = & buffer[0];
rhs_v[table[i].first]=buffer[table[i].second]; auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
}); });
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
@@ -194,21 +257,33 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
auto rhs_v = rhs.View(); int _slice_stride = rhs.Grid()->_slice_stride[dimension];
thread_for_collapse(2,n,e1,{ int _slice_block = rhs.Grid()->_slice_block[dimension];
for(int b=0;b<e2;b++){ #ifdef ACCELERATOR_CSHIFT
int o = n*rhs.Grid()->_slice_stride[dimension]; autoView( rhs_v , rhs, AcceleratorWrite);
int offset = b+n*rhs.Grid()->_slice_block[dimension]; accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}
}); });
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View(); assert(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
@@ -225,6 +300,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// local to node block strided copies // local to node block strided copies
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask) template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -239,14 +315,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0; int ent=0;
if(cbmask == 0x3 ){ if(cbmask == 0x3 ){
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride+b; int o =n*stride+b;
table[ent++] = std::pair<int,int>(lo+o,ro+o); Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
} }
} else { } else {
@@ -255,23 +333,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int o =n*stride+b; int o =n*stride+b;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o); Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
} }
} }
} }
auto rhs_v = rhs.View(); {
auto lhs_v = lhs.View(); auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{ thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second]; lhs_v[table[i].first]=rhs_v[table[i].second];
}); });
#endif
}
} }
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@@ -285,29 +372,41 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e2=rhs.Grid()->_slice_block [dimension]; int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0; int ent=0;
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} else { } else {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} }
auto rhs_v = rhs.View(); {
auto lhs_v = lhs.View(); auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{ thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
}); });
#endif
}
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////

View File

@@ -101,7 +101,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -121,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size); static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
commVector<vobj> recv_buf(buffer_size); static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -138,7 +139,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
} else { } else {
int words = send_buf.size(); int words = buffer_size;
if (cbmask != 0x3) words=words>>1; if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
@@ -150,12 +151,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier(); grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -195,8 +198,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
@@ -242,11 +252,204 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){ if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf_extract[i][0], (void *)recv_buf_extract_mpi,
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier(); grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
} else { } else {
@@ -258,7 +461,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
} }
} }
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -0,0 +1,4 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
Vector<std::pair<int,int> > Cshift_table;
NAMESPACE_END(Grid);

View File

@@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/lattice/Lattice_view.h>
#include <Grid/lattice/Lattice_base.h> #include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h> #include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h> #include <Grid/lattice/Lattice_ET.h>
@@ -36,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_reduction.h> #include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h> #include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h> #include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
#include <Grid/lattice/Lattice_comparison_utils.h> #include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h> #include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h> #include <Grid/lattice/Lattice_coordinate.h>
@@ -43,4 +45,5 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_rng.h> #include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h> #include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h> #include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>

View File

@@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp> Author: neo <cossu@post.kek.jp>
Author: Christoph Lehner <christoph@lhnr.de
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -41,9 +42,24 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Predicated where support // Predicated where support
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
#ifdef GRID_SIMT
// drop to scalar in SIMT; cleaner in fact
template <class iobj, class vobj, class robj> template <class iobj, class vobj, class robj>
accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, accelerator_inline vobj predicatedWhere(const iobj &predicate,
const robj &iffalse) { const vobj &iftrue,
const robj &iffalse)
{
Integer mask = TensorRemove(predicate);
typename std::remove_const<vobj>::type ret= iffalse;
if (mask) ret=iftrue;
return ret;
}
#else
template <class iobj, class vobj, class robj>
accelerator_inline vobj predicatedWhere(const iobj &predicate,
const vobj &iftrue,
const robj &iffalse)
{
typename std::remove_const<vobj>::type ret; typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
@@ -67,6 +83,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
merge(ret, falsevals); merge(ret, falsevals);
return ret; return ret;
} }
#endif
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
//Specialization of getVectorType for lattices //Specialization of getVectorType for lattices
@@ -80,26 +97,62 @@ struct getVectorType<Lattice<T> >{
//-- recursive evaluation of expressions; -- //-- recursive evaluation of expressions; --
// handle leaves of syntax tree // handle leaves of syntax tree
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template<class sobj> accelerator_inline template<class sobj,
typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr>
accelerator_inline
sobj eval(const uint64_t ss, const sobj &arg) sobj eval(const uint64_t ss, const sobj &arg)
{ {
return arg; return arg;
} }
template <class lobj> accelerator_inline template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
{ {
return arg[ss]; return arg(ss);
}
////////////////////////////////////////////
//-- recursive evaluation of expressions; --
// whole vector return, used only for expression return type inference
///////////////////////////////////////////////////
template<class sobj> accelerator_inline
sobj vecEval(const uint64_t ss, const sobj &arg)
{
return arg;
} }
template <class lobj> accelerator_inline template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg)
{ {
auto view = arg.View(); return arg[ss];
return view[ss];
} }
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand // handle nodes in syntax tree- eval one operand
// vecEval needed (but never called as all expressions offloaded) to infer the return type
// in SIMT contexts of closure.
///////////////////////////////////////////////////
template <typename Op, typename T1> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
-> decltype(expr.op.func( vecEval(ss, expr.arg1)))
{
return expr.op.func( vecEval(ss, expr.arg1) );
}
// vecEval two operands
template <typename Op, typename T1, typename T2> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
{
return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
}
// vecEval three operands
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
{
return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
}
///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand coalesced
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <typename Op, typename T1> accelerator_inline template <typename Op, typename T1> accelerator_inline
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr) auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
@@ -107,23 +160,41 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
{ {
return expr.op.func( eval(ss, expr.arg1) ); return expr.op.func( eval(ss, expr.arg1) );
} }
///////////////////////
// eval two operands // eval two operands
///////////////////////
template <typename Op, typename T1, typename T2> accelerator_inline template <typename Op, typename T1, typename T2> accelerator_inline
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr) auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2))) -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
{ {
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) ); return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
} }
///////////////////////
// eval three operands // eval three operands
///////////////////////
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3))) -> decltype(expr.op.func(eval(ss, expr.arg1),
eval(ss, expr.arg2),
eval(ss, expr.arg3)))
{ {
return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)); #ifdef GRID_SIMT
// Handles Nsimd (vInteger) != Nsimd(ComplexD)
typedef decltype(vecEval(ss, expr.arg2)) rvobj;
typedef typename std::remove_reference<rvobj>::type vobj;
const int Nsimd = vobj::vector_type::Nsimd();
auto vpred = vecEval(ss,expr.arg1);
ExtractBuffer<Integer> mask(Nsimd);
extract<vInteger, Integer>(TensorRemove(vpred), mask);
int s = acceleratorSIMTlane(Nsimd);
return expr.op.func(mask[s],
eval(ss, expr.arg2),
eval(ss, expr.arg3));
#else
return expr.op.func(eval(ss, expr.arg1),
eval(ss, expr.arg2),
eval(ss, expr.arg3));
#endif
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -179,16 +250,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
cb = lat.Checkerboard(); cb = lat.Checkerboard();
} }
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
{
}
template <typename Op, typename T1> inline template <typename Op, typename T1> inline
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr) void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
{ {
CBFromExpression(cb, expr.arg1); // recurse AST CBFromExpression(cb, expr.arg1); // recurse AST
} }
template <typename Op, typename T1, typename T2> inline template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr) void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{ {
@@ -203,32 +270,86 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
CBFromExpression(cb, expr.arg3); // recurse AST CBFromExpression(cb, expr.arg3); // recurse AST
} }
//////////////////////////////////////////////////////////////////////////
// ViewOpen
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
{
lat.ViewOpen(AcceleratorRead);
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // rrecurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // recurse AST
ExpressionViewOpen(expr.arg3); // recurse AST
}
//////////////////////////////////////////////////////////////////////////
// ViewClose
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
{
lat.ViewClose();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
ExpressionViewClose(expr.arg3); // recurse AST
}
//////////////////////////////////////////// ////////////////////////////////////////////
// Unary operators and funcs // Unary operators and funcs
//////////////////////////////////////////// ////////////////////////////////////////////
#define GridUnopClass(name, ret) \ #define GridUnopClass(name, ret) \
template <class arg> \
struct name { \ struct name { \
static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \ template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
}; };
GridUnopClass(UnarySub, -a); GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a)); GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryReal, real(a));
GridUnopClass(UnaryImag, imag(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a)); GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a)); GridUnopClass(UnaryAsin, asin(a));
@@ -240,10 +361,10 @@ GridUnopClass(UnaryExp, exp(a));
// Binary operators // Binary operators
//////////////////////////////////////////// ////////////////////////////////////////////
#define GridBinOpClass(name, combination) \ #define GridBinOpClass(name, combination) \
template <class left, class right> \
struct name { \ struct name { \
template <class _left, class _right> \
static auto accelerator_inline \ static auto accelerator_inline \
func(const left &lhs, const right &rhs) \ func(const _left &lhs, const _right &rhs) \
-> decltype(combination) const \ -> decltype(combination) const \
{ \ { \
return combination; \ return combination; \
@@ -263,10 +384,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
// Trinary conditional op // Trinary conditional op
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
#define GridTrinOpClass(name, combination) \ #define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \
struct name { \ struct name { \
template <class _predicate,class _left, class _right> \
static auto accelerator_inline \ static auto accelerator_inline \
func(const predicate &pred, const left &lhs, const right &rhs) \ func(const _predicate &pred, const _left &lhs, const _right &rhs) \
-> decltype(combination) const \ -> decltype(combination) const \
{ \ { \
return combination; \ return combination; \
@@ -274,17 +395,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
}; };
GridTrinOpClass(TrinaryWhere, GridTrinOpClass(TrinaryWhere,
(predicatedWhere<predicate, (predicatedWhere<
typename std::remove_reference<left>::type, typename std::remove_reference<_predicate>::type,
typename std::remove_reference<right>::type>(pred, lhs,rhs))); typename std::remove_reference<_left>::type,
typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
//////////////////////////////////////////// ////////////////////////////////////////////
// Operator syntactical glue // Operator syntactical glue
//////////////////////////////////////////// ////////////////////////////////////////////
#define GRID_UNOP(name) name
#define GRID_UNOP(name) name<decltype(eval(0, arg))> #define GRID_BINOP(name) name
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))> #define GRID_TRINOP(name) name
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \ #define GRID_DEF_UNOP(op, name) \
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \ template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
@@ -330,22 +451,17 @@ GridTrinOpClass(TrinaryWhere,
GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(adj, UnaryAdj); //GRID_DEF_UNOP(adj, UnaryAdj);
GRID_DEF_UNOP(conjugate, UnaryConj); //GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(real, UnaryReal);
GRID_DEF_UNOP(imag, UnaryImag);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI); GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing // abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt); GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin); GRID_DEF_UNOP(asin, UnaryAsin);
@@ -370,29 +486,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Op, class T1> template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr) auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
{ {
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr); Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2> template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
{ {
Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr); Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2, class T3> template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1), -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
eval(0, expr.arg2), vecEval(0, expr.arg2),
eval(0, expr.arg3)))> vecEval(0, expr.arg3)))>::type >
{ {
Lattice<decltype(expr.op.func(eval(0, expr.arg1), Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
eval(0, expr.arg2), vecEval(0, expr.arg2),
eval(0, expr.arg3)))> ret(expr); vecEval(0, expr.arg3)))>::type > ret(expr);
return ret; return ret;
} }
#define EXPRESSION_CLOSURE(function) \
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
auto function(Expression &expr) -> decltype(function(closure(expr))) \
{ \
return function(closure(expr)); \
}
#undef GRID_UNOP #undef GRID_UNOP
#undef GRID_BINOP #undef GRID_BINOP

View File

@@ -7,6 +7,7 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -55,13 +56,13 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
auto tmp =ret_v(ss);
mac(&tmp,&lhs_t,&rhs_t); mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
}); });
@@ -72,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@@ -88,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@@ -107,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs); mult(&tmp,&lhs_v(ss),&rhs);
@@ -120,10 +121,10 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; auto tmp =ret_v(ss);
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs); mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@@ -134,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@@ -147,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@@ -164,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.View(); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@@ -178,10 +179,10 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.View(); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; auto tmp =ret_v(ss);
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t); mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@@ -192,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.View(); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@@ -205,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.View(); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@@ -220,11 +221,11 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto x_v = x.View(); autoView( x_v , x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss); auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
}); });
} }
@@ -233,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto x_v = x.View(); autoView( x_v , x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);

View File

@@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -28,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#define STREAMING_STORES #define STREAMING_STORES
@@ -36,129 +38,6 @@ NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16]; extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics. // The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code // This contains extra (host resident) grid pointer data that may be accessed by host code
@@ -173,13 +52,14 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef vobj vector_object; typedef vobj vector_object;
private: private:
void dealloc(void) void dealloc(void)
{ {
alignedAllocator<vobj> alloc;
if( this->_odata_size ) { if( this->_odata_size ) {
alignedAllocator<vobj> alloc;
alloc.deallocate(this->_odata,this->_odata_size); alloc.deallocate(this->_odata,this->_odata_size);
this->_odata=nullptr; this->_odata=nullptr;
this->_odata_size=0; this->_odata_size=0;
@@ -187,25 +67,43 @@ private:
} }
void resize(uint64_t size) void resize(uint64_t size)
{ {
alignedAllocator<vobj> alloc;
if ( this->_odata_size != size ) { if ( this->_odata_size != size ) {
alignedAllocator<vobj> alloc;
dealloc(); dealloc();
}
this->_odata_size = size; this->_odata_size = size;
if ( size ) if ( size )
this->_odata = alloc.allocate(this->_odata_size); this->_odata = alloc.allocate(this->_odata_size);
else else
this->_odata = nullptr; this->_odata = nullptr;
} }
}
public: public:
/////////////////////////////////////////////////////////////////////////////////
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose();
}
// Helper function to print the state of this object in the AccCache
void PrintCacheState(void)
{
MemoryManager::PrintState(this->_odata);
}
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops. // Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device // The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas // in device lambdas
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const
LatticeView<vobj> View (ViewMode mode) const
{ {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this)); LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
return accessor; return accessor;
} }
@@ -229,11 +127,15 @@ public:
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = View(); auto exprCopy = expr;
accelerator_for(ss,me.size(),1,{ ExpressionViewOpen(exprCopy);
auto tmp = eval(ss,expr); auto me = View(AcceleratorWriteDiscard);
vstream(me[ss],tmp); accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@@ -248,11 +150,15 @@ public:
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = View(); auto exprCopy = expr;
accelerator_for(ss,me.size(),1,{ ExpressionViewOpen(exprCopy);
auto tmp = eval(ss,expr); auto me = View(AcceleratorWriteDiscard);
vstream(me[ss],tmp); accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@@ -266,11 +172,15 @@ public:
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = View(); auto exprCopy = expr;
accelerator_for(ss,me.size(),1,{ ExpressionViewOpen(exprCopy);
auto tmp = eval(ss,expr); auto me = View(AcceleratorWriteDiscard);
vstream(me[ss],tmp); accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto tmp = eval(ss,exprCopy);
coalescedWrite(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
//GridFromExpression is tricky to do //GridFromExpression is tricky to do
@@ -321,10 +231,11 @@ public:
} }
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View(); auto me = View(CpuWrite);
thread_for(ss,me.size(),{ thread_for(ss,me.size(),{
me[ss] = r; me[ss]= r;
}); });
me.ViewClose();
return *this; return *this;
} }
@@ -334,11 +245,12 @@ public:
/////////////////////////////////////////// ///////////////////////////////////////////
// user defined constructor // user defined constructor
/////////////////////////////////////////// ///////////////////////////////////////////
Lattice(GridBase *grid) { Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid; this->_grid = grid;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0); assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0; this->checkerboard=0;
SetViewMode(mode);
} }
// virtual ~Lattice(void) = default; // virtual ~Lattice(void) = default;
@@ -346,7 +258,7 @@ public:
void reset(GridBase* grid) { void reset(GridBase* grid) {
if (this->_grid != grid) { if (this->_grid != grid) {
this->_grid = grid; this->_grid = grid;
this->_odata.resize(grid->oSites()); this->resize(grid->oSites());
this->checkerboard = 0; this->checkerboard = 0;
} }
} }
@@ -354,7 +266,6 @@ public:
// copy constructor // copy constructor
/////////////////////////////////////////// ///////////////////////////////////////////
Lattice(const Lattice& r){ Lattice(const Lattice& r){
// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
this->_grid = r.Grid(); this->_grid = r.Grid();
resize(this->_grid->oSites()); resize(this->_grid->oSites());
*this = r; *this = r;
@@ -377,11 +288,12 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0; typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r); conformable(*this,r);
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
auto me = View(); auto me = View(AcceleratorWriteDiscard);
auto him= r.View(); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
me.ViewClose(); him.ViewClose();
return *this; return *this;
} }
@@ -391,11 +303,12 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){ inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
conformable(*this,r); conformable(*this,r);
auto me = View(); auto me = View(AcceleratorWriteDiscard);
auto him= r.View(); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
me.ViewClose(); him.ViewClose();
return *this; return *this;
} }
/////////////////////////////////////////// ///////////////////////////////////////////

View File

@@ -0,0 +1,248 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_basis.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
// If assume basis[j] are already orthonormal,
// can take all inner products in parallel saving 2x bandwidth
// Save 3x bandwidth on the second line of loop.
// perhaps 2.5x speed up.
// 2x overall in Multigrid Lanczos
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View;
Vector<View> basis_v; basis_v.reserve(basis.size());
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
#if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
return;
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of matrix
Vector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
Qt_p[i]=Qt(j,k);
});
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
decltype(coalescedRead(Bp[ss])) z;
z=Zero();
coalescedWrite(Bp[ss],z);
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
}
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View(AcceleratorRead)) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorRead));
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero();
auto B=coalescedRead(zzz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
NAMESPACE_END(Grid);

View File

@@ -42,34 +42,6 @@ NAMESPACE_BEGIN(Grid);
typedef iScalar<vInteger> vPredicate ; typedef iScalar<vInteger> vPredicate ;
/*
template <class iobj, class vobj, class robj> accelerator_inline
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
{
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
*/
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// compare lattice to lattice // compare lattice to lattice
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -78,9 +50,9 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
}); });
@@ -93,8 +65,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{ {
Lattice<vPredicate> ret(lhs.Grid()); Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs); ret_v[ss]=op(lhs_v[ss],rhs);
}); });
@@ -107,8 +79,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]); ret_v[ss]=op(lhs,rhs_v[ss]);
}); });

View File

@@ -37,38 +37,19 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid(); GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
autoView(l_v, l, CpuWrite);
thread_for( o, grid->oSites(), {
vector_type vI;
Coordinate gcoor; Coordinate gcoor;
ExtractBuffer<scalar_type> mergebuf(Nsimd); ExtractBuffer<scalar_type> mergebuf(Nsimd);
vector_type vI;
auto l_v = l.View();
for(int o=0;o<grid->oSites();o++){
for(int i=0;i<grid->iSites();i++){ for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu]; mergebuf[i]=(Integer)gcoor[mu];
} }
merge<vector_type,scalar_type>(vI,mergebuf); merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI; l_v[o]=vI;
} });
}; };
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -2,11 +2,10 @@
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/PartialFractionFermion5D.cc Source file: ./lib/lattice/Lattice_crc.h
Copyright (C) 2015 Copyright (C) 2021
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
@@ -26,14 +25,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h> #pragma once
#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>
#include <Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#include "impl.h" template<class vobj> uint32_t crc(Lattice<vobj> & buf)
template class PartialFractionFermion5D<IMPLEMENTATION>; {
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -43,8 +43,8 @@ template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
}); });
@@ -56,9 +56,9 @@ template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
}); });
@@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr; typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid()); Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),1,{ accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer // FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop // Use vector [] operator and don't read coalesce this loop

View File

@@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); autoView( X_v , X, CpuRead);
auto Y_v = Y.View(); autoView( Y_v , Y, CpuRead);
auto R_v = R.View(); autoView( R_v , R, CpuWrite);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); autoView( X_v , X, CpuRead);
auto R_v = R.View(); autoView( R_v , R, CpuWrite);
thread_region thread_region
{ {
@@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, CpuRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, CpuRead);
thread_region { thread_region {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);

View File

@@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
}); });
return ret; return ret;
@@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
}); });
return ret; return ret;
@@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{ {
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorWrite);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
}); });
} }
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{ {
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorWrite);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
}); });
} }
@@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// extract-modify-merge cycle is easiest way and this is not perf critical // extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); autoView( l_v , l, CpuWrite);
if ( rank == grid->ThisRank() ) { if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf); extract(l_v[odx],buf);
buf[idx] = s; buf[idx] = s;
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); autoView( l_v , l, CpuWrite);
extract(l_v[odx],buf); extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
return; return;
}; };
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Must be CPU read view
template<class vobj,class sobj> template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid = l.Grid(); GridBase *grid = l.getGrid();
assert(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -173,8 +173,7 @@ void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
@@ -183,18 +182,27 @@ void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
return; return;
}; };
template<class vobj,class sobj> template<class vobj,class sobj>
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
{
autoView(lv,l,CpuRead);
peekLocalSite(s,lv,site);
return;
};
GridBase *grid=l.Grid(); // Must be CPU write view
template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid=l.getGrid();
assert(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -202,13 +210,19 @@ void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w]; vp[idx+w*Nsimd] = pt[w];
} }
return;
};
template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
{
autoView(lv,l,CpuWrite);
pokeLocalSite(s,lv,site);
return; return;
}; };

View File

@@ -0,0 +1,79 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reality.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REAL_IMAG_H
#define GRID_LATTICE_REAL_IMAG_H
// FIXME .. this is the sector of the code
// I am most worried about the directions
// The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome
NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] =real(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] =imag(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto real(const Expression &expr) -> decltype(real(closure(expr)))
{
return real(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto imag(const Expression &expr) -> decltype(imag(closure(expr)))
{
return imag(closure(expr));
}
NAMESPACE_END(Grid);
#endif

View File

@@ -40,24 +40,77 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { autoView( ret_v, ret, AcceleratorWrite);
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = adj(lhs_v[ss]);
}); });
return ret; return ret;
}; };
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View(); autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
}); });
return ret; return ret;
}; };
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
Lattice<typename vobj::Complexified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toComplex(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
Lattice<typename vobj::Realified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toReal(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toComplex(const Expression &expr) -> decltype(closure(expr))
{
return toComplex(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toReal(const Expression &expr) -> decltype(closure(expr))
{
return toReal(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto adj(const Expression &expr) -> decltype(closure(expr))
{
return adj(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto conjugate(const Expression &expr) -> decltype(closure(expr))
{
return conjugate(closure(expr));
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -5,6 +5,7 @@
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or the Free Software Foundation; either version 2 of the License, or
@@ -24,7 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/Grid_Eigen_Dense.h> #include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC #if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h> #include <Grid/lattice/Lattice_reduction_gpu.h>
#endif #endif
@@ -38,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd(); // const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
@@ -62,23 +92,69 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
ssum = ssum+sumarray[i]; ssum = ssum+sumarray[i];
} }
return ssum; typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
} }
/*
Threaded max, don't use for now
template<class Double>
inline Double max(const Double *arg, Integer osites)
{
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
std::vector<Double> maxarray(nthread);
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
Double max=arg[0];
for(int ss=myoff;ss<mywork+myoff; ss++){
if( arg[ss] > max ) max = arg[ss];
}
maxarray[thr]=max;
});
Double tmax=maxarray[0];
for(int i=0;i<nthread;i++){
if (maxarray[i]>tmax) tmax = maxarray[i];
}
return tmax;
}
*/
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{ {
#ifdef GRID_NVCC #if defined(GRID_CUDA)||defined(GRID_HIP)
return sum_gpu(arg,osites); return sum_gpu(arg,osites);
#else #else
return sum_cpu(arg,osites); return sum_cpu(arg,osites);
#endif #endif
} }
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{ {
auto arg_v = arg.View(); #if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites(); Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites); auto ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum); arg.Grid()->GlobalSum(ssum);
return ssum; return ssum;
} }
@@ -91,9 +167,35 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm); return real(nrm);
} }
//The global maximum of the site norm2
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
{
typedef typename vobj::tensor_reduced vscalar; //iScalar<iScalar<.... <vPODtype> > >
typedef typename vscalar::scalar_object scalar; //iScalar<iScalar<.... <PODtype> > >
Lattice<vscalar> inner = localNorm2(arg);
auto grid = arg.Grid();
RealD max;
for(int l=0;l<grid->lSites();l++){
Coordinate coor;
scalar val;
RealD r;
grid->LocalIndexToLocalCoor(l,coor);
peekLocalSite(val,inner,coor);
r=real(TensorRemove(val));
if( (l==0) || (r>max)){
max=r;
}
}
grid->GlobalMax(max);
return max;
}
// Double inner product // Double inner product
template<class vobj> template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
@@ -101,47 +203,41 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
// Might make all code paths go this way.
auto left_v = left.View();
auto right_v=right.View();
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC // Might make all code paths go this way.
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance... // GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; accelerator_for( ss, sites, 1,{
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
})
// This is in single precision and fails some tests
// Need a sumD that sums in double
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss]; auto x_l = left_v[ss];
auto y_l = right_v[ss]; auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l); inner_tmp_v[ss]=innerProductD(x_l,y_l);
}) });
nrm = TensorRemove(sum(inner_tmp_v,sites)); }
#endif
grid->GlobalSum(nrm);
// This is in single precision and fails some tests
auto anrm = sum(inner_tmp_v,sites);
nrm = anrm;
return nrm; return nrm;
} }
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
ComplexD nrm = rankInnerProduct(left,right);
grid->GlobalSum(nrm);
return nrm;
}
///////////////////////// /////////////////////////
// Fast axpby_norm // Fast axpby_norm
// z = a x + b y // z = a x + b y
@@ -167,44 +263,66 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid(); GridBase *grid = x.Grid();
auto x_v=x.View();
auto y_v=y.View();
auto z_v=z.View();
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU // GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; autoView( x_v, x, AcceleratorRead);
Vector<inner_t> inner_tmp(sites); autoView( y_v, y, AcceleratorRead);
auto inner_tmp_v = &inner_tmp[0]; autoView( z_v, z, AcceleratorWrite);
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, 1,{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp); inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp; z_v[ss]=tmp;
}); });
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites))); nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
return nrm; return nrm;
} }
template<class vobj> strong_inline void
innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
{
conformable(left,right);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
Vector<ComplexD> tmp(2);
GridBase *grid = left.Grid();
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
// GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
{
autoView(left_v,left, AcceleratorRead);
autoView(right_v,right,AcceleratorRead);
accelerator_for( ss, sites, 1,{
auto left_tmp = left_v[ss];
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
});
}
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0];
nrm = real(tmp[1]);
}
template<class Op,class T1> template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
@@ -243,6 +361,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// But easily avoided by using double precision fields // But easily avoided by using double precision fields
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid(); GridBase *grid = Data.Grid();
assert(grid!=NULL); assert(grid!=NULL);
@@ -271,7 +390,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
auto Data_v=Data.View(); autoView( Data_v, Data, CpuRead);
thread_for( r,rd, { thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
@@ -301,132 +420,21 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
} }
// sum over nodes. // sum over nodes.
sobj gsum;
for(int t=0;t<fd;t++){ for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane int pt = t/ld; // processor plane
int lt = t%ld; int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) { if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt]; result[t]=lsSum[lt];
} else { } else {
gsum=Zero(); result[t]=Zero();
} }
grid->GlobalSum(gsum);
result[t]=gsum;
} }
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
} }
template<class vobj>
static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
// std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
typedef typename vobj::scalar_type scalar_type;
std::vector<scalar_type> lsSum;
localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
// std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
}
template <class vobj>
static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
// std::cout << GridLogMessage << "Start prep" << std::endl;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid();
assert(grid!=NULL);
conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// std::cout << GridLogMessage << "Start alloc" << std::endl;
Vector<vector_type> lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
// std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv;
auto l_v=lhs.View();
auto r_v=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
lvSum[r] = lvSum[r] + vv;
}
}
});
// std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
}
template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid();
int fd = result.size();
int ld = lsSum.size();
// sum over nodes.
std::vector<scalar_type> gsum;
gsum.resize(fd, scalar_type(0.0));
// std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum[t]=lsSum[lt];
}
}
// std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
// std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
grid->GlobalSumVector(&gsum[0], fd);
// std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
result = gsum;
}
template<class vobj> template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{ {
@@ -459,8 +467,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View(); autoView( lhv, lhs, CpuRead);
auto rhv=rhs.View(); autoView( rhv, rhs, CpuRead);
thread_for( r,rd,{ thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@@ -567,14 +575,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
auto Rv=R.View(); autoView( Rv, R, CpuWrite);
auto Xv=X.View(); autoView( Xv, X, CpuRead);
auto Yv=Y.View(); autoView( Yv, Y, CpuRead);
thread_for_collapse(2, n, e1, { thread_for2d( n, e1, b,e2, {
for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss]; Rv[ss] = at*Xv[ss]+Yv[ss];
}
}); });
} }
}; };
@@ -627,9 +633,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v=X.View(); autoView( X_v, X, CpuRead);
auto Y_v=Y.View(); autoView( Y_v, Y, CpuRead);
auto R_v=R.View(); autoView( R_v, R, CpuWrite);
thread_region thread_region
{ {
Vector<vobj> s_x(Nblock); Vector<vobj> s_x(Nblock);
@@ -674,13 +680,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// int nl=1; // int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog]; int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View(); autoView( R_v, R, CpuWrite);
auto X_v = X.View(); autoView( X_v, X, CpuRead);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@@ -738,8 +745,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto rhs_v=rhs.View(); autoView( rhs_v, rhs, CpuRead);
thread_region thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);

View File

@@ -1,7 +1,14 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define WARP_SIZE 32 #ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;
#define WARP_SIZE 64
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props; extern cudaDeviceProp *gpu_props;
#define WARP_SIZE 32
#endif
__device__ unsigned int retirementCount = 0; __device__ unsigned int retirementCount = 0;
template <class Iterator> template <class Iterator>
@@ -19,7 +26,12 @@ template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device; int device;
#ifdef GRID_CUDA
cudaGetDevice(&device); cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize; Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock; Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
@@ -30,7 +42,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl; std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl; std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl; std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl; std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
if (warpSize != WARP_SIZE) { if (warpSize != WARP_SIZE) {
@@ -40,6 +51,10 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
// let the number of threads in a block be a multiple of 2, starting from warpSize // let the number of threads in a block be a multiple of 2, starting from warpSize
threads = warpSize; threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
exit(EXIT_FAILURE);
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2; while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy // keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount); blocks = nextPow2(multiProcessorCount);
@@ -53,7 +68,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
// cannot use overloaded operators for sobj as they are not volatile-qualified // cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp(); acceleratorSynchronise();
const Iterator VEC = WARP_SIZE; const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1); const Iterator vid = tid & (VEC-1);
@@ -67,9 +82,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
beta += temp; beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
} }
__syncwarp(); acceleratorSynchronise();
} }
__syncthreads(); acceleratorSynchroniseAll();
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
beta = Zero(); beta = Zero();
@@ -79,7 +94,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
} }
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
} }
__syncthreads(); acceleratorSynchroniseAll();
} }
@@ -147,7 +162,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
sobj *smem = (sobj *)shmem_pointer; sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished // wait until all outstanding memory instructions in this thread are finished
__threadfence(); acceleratorFence();
if (tid==0) { if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x); unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
@@ -156,7 +171,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
} }
// each thread must read the correct value of amLast // each thread must read the correct value of amLast
__syncthreads(); acceleratorSynchroniseAll();
if (amLast) { if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1] // reduce buffer[0], ..., buffer[gridDim.x-1]
@@ -199,13 +214,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
cudaDeviceSynchronize(); accelerator_barrier();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
auto result = buffer_v[0]; auto result = buffer_v[0];
return result; return result;
} }

View File

@@ -32,8 +32,9 @@
#include <random> #include <random>
#ifdef RNG_SITMO #ifdef RNG_SITMO
#include <Grid/sitmo_rng/sitmo_prng_engine.hpp> #include <Grid/random/sitmo_prng_engine.hpp>
#endif #endif
#include <Grid/random/gaussian.h>
#if defined(RNG_SITMO) #if defined(RNG_SITMO)
#define RNG_FAST_DISCARD #define RNG_FAST_DISCARD
@@ -142,8 +143,8 @@ public:
std::vector<RngEngine> _generators; std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform; std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian; std::vector<Grid::gaussian_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli; // std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid; std::vector<std::uniform_int_distribution<uint32_t> > _uid;
/////////////////////// ///////////////////////
@@ -243,8 +244,8 @@ public:
GridSerialRNG() : GridRNGbase() { GridSerialRNG() : GridRNGbase() {
_generators.resize(1); _generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) ); _gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); // _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() ); _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
} }
@@ -357,8 +358,8 @@ public:
_generators.resize(_vol); _generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) ); _gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); // _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() ); _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
} }
@@ -375,7 +376,7 @@ public:
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View(); autoView(l_v, l, CpuWrite);
thread_for( ss, osites, { thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd); ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
@@ -462,7 +463,7 @@ public:
{ {
// Obtain one reseeded generator per thread // Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads(); int Nthread = 32; // Hardwire a good level or parallelism
std::vector<RngEngine> seeders(Nthread); std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){ for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine); seeders[t] = Reseed(master_engine);
@@ -515,11 +516,11 @@ public:
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); } template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); } template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);} //template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); } template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } //template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -37,17 +37,19 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace // Trace
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
/*
template<class vobj> template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))> inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{ {
Lattice<decltype(trace(vobj()))> ret(lhs.Grid()); Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView(ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView(lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss))); coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
}); });
return ret; return ret;
}; };
*/
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation // Trace Index level dependent operation
@@ -56,8 +58,8 @@ template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{ {
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
}); });

View File

@@ -1,5 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_transfer.h Source file: ./lib/lattice/Lattice_transfer.h
@@ -7,6 +6,7 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard // remove and insert a half checkerboard
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
{
half.Checkerboard() = cb; half.Checkerboard() = cb;
auto half_v = half.View(); autoView( half_v, half, CpuWrite);
auto full_v = full.View(); autoView( full_v, full, CpuRead);
thread_for(ss, full.Grid()->oSites(),{ thread_for(ss, full.Grid()->oSites(),{
int cbos; int cbos;
Coordinate coor; Coordinate coor;
@@ -64,10 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
} }
}); });
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
{
int cb = half.Checkerboard(); int cb = half.Checkerboard();
auto half_v = half.View(); autoView( half_v , half, CpuRead);
auto full_v = full.View(); autoView( full_v , full, CpuWrite);
thread_for(ss,full.Grid()->oSites(),{ thread_for(ss,full.Grid()->oSites(),{
Coordinate coor; Coordinate coor;
@@ -83,57 +85,215 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
}); });
} }
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
{
half.Checkerboard() = cb;
autoView(half_v, half, AcceleratorWrite);
autoView(full_v, full, AcceleratorRead);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
template<class vobj,class CComplex,int nbasis> Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++) {
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(half_v[ssh],full_v(ss));
}
});
}
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
{
int cb = half.Checkerboard();
autoView(half_v , half, AcceleratorRead);
autoView(full_v , full, AcceleratorWrite);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++){
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(full_v[ss],half_v(ssh));
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Flexible Type Conversion for internal promotion to double as well as graceful
// treatment of scalar-compatible types
////////////////////////////////////////////////////////////////////////////////////////////
accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
out = in;
}
accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
out = in;
}
template<typename T>
accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
out = in;
}
// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
/*template<typename T1, typename T2>
accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
convertType(T1 & out, const T2 & in) {
out = in;
}*/
#ifdef GRID_SIMT
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
}
#endif
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
}
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
}
template<typename T1,typename T2>
accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) {
convertType(out._internal,in._internal);
}
template<typename T1,typename T2>
accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) {
convertType(out,in._internal);
}
template<typename T1,typename T2>
accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) {
convertType(out._internal,in);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
for (int i=0;i<N;i++)
for (int j=0;j<N;j++)
convertType(out._internal[i][j],in._internal[i][j]);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
for (int i=0;i<N;i++)
convertType(out._internal[i],in._internal[i]);
}
template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
autoView( out_v , out,AcceleratorWrite);
autoView( in_v , in ,AcceleratorRead);
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
convertType(out_v[ss],in_v(ss));
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// precision-promoted local inner product
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
{
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
Lattice<iScalar<t_inner>> ret(lhs.Grid());
{
autoView(ret_v, ret,AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
});
}
return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////
// block routines
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData, const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis) const VLattice &Basis)
{ {
GridBase * fine = fineData.Grid(); GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid(); GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension;
// checks Lattice<iScalar<CComplex>> ip(coarse);
assert( nbasis == Basis.size() ); Lattice<vobj> fineDataRed = fineData;
subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){
conformable(Basis[i],fineData);
}
Coordinate block_r (_ndimension); autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
for(int d=0 ; d<_ndimension;d++){ for(int v=0;v<nbasis;v++) {
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
} convertType(coarseData_[sc](v),ip_[sc]);
coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
// Loop over coars parallel, and then loop over fine associated with coarse.
thread_for( sf, fine->oSites(), {
int sc;
Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
thread_critical {
for(int i=0;i<nbasis;i++) {
auto Basis_ = Basis[i].View();
coarseData_[sc](i)=coarseData_[sc](i) + innerProduct(Basis_[sf],fineData_[sf]);
}
}
}); });
return;
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
}
} }
template<class vobj,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
const Lattice<CComplex> &coarseA, const Lattice<CComplex> &coarseA,
const Lattice<vobj> &fineX, const Lattice<vobj2> &fineX,
const Lattice<vobj> &fineY) const Lattice<vobj> &fineY)
{ {
GridBase * fine = fineZ.Grid(); GridBase * fine = fineZ.Grid();
@@ -155,29 +315,68 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
auto fineZ_ = fineZ.View(); autoView( fineZ_ , fineZ, AcceleratorWrite);
auto fineX_ = fineX.View(); autoView( fineX_ , fineX, AcceleratorRead);
auto fineY_ = fineY.View(); autoView( fineY_ , fineY, AcceleratorRead);
auto coarseA_= coarseA.View(); autoView( coarseA_, coarseA, AcceleratorRead);
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
thread_for(sf, fine->oSites(), { accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
int sc; int sc;
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
// z = A x + y // z = A x + y
fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf]; #ifdef GRID_SIMT
typename vobj2::tensor_reduced::scalar_object cA;
typename vobj::scalar_object cAx;
#else
typename vobj2::tensor_reduced cA;
vobj cAx;
#endif
convertType(cA,TensorRemove(coarseA_(sc)));
auto prod = cA*fineX_(sf);
convertType(cAx,prod);
coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
}); });
return; return;
} }
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY)
{
typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
GridBase *coarse(CoarseInner.Grid());
GridBase *fine (fineX.Grid());
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse);
// Precision promotion
fine_inner = localInnerProductD<vobj>(fineX,fineY);
blockSum(coarse_inner,fine_inner);
{
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
});
}
}
template<class vobj,class CComplex> // deprecate
inline void blockInnerProduct(Lattice<CComplex> &CoarseInner, inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
const Lattice<vobj> &fineX, const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY) const Lattice<vobj> &fineY)
@@ -191,15 +390,17 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion? // Precision promotion?
auto CoarseInner_ = CoarseInner.View();
auto coarse_inner_ = coarse_inner.View();
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
thread_for(ss, coarse->oSites(),{ {
autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss]; CoarseInner_[ss] = coarse_inner_[ss];
}); });
}
} }
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX) inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
{ {
@@ -226,30 +427,48 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
int blockVol = fine->oSites()/coarse->oSites();
// Turn this around to loop threaded over sc and interior loop // Turn this around to loop threaded over sc and interior loop
// over sf would thread better // over sf would thread better
coarseData=Zero(); autoView( coarseData_ , coarseData, AcceleratorWrite);
auto coarseData_ = coarseData.View(); autoView( fineData_ , fineData, AcceleratorRead);
auto fineData_ = fineData.View();
thread_for(sf,fine->oSites(),{ auto coarseData_p = &coarseData_[0];
int sc; auto fineData_p = &fineData_[0];
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){
int sf;
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); cd=cd+fineData_p[sf];
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
thread_critical {
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
} }
coarseData_p[sc] = cd;
}); });
return; return;
} }
template<class vobj> template<class vobj>
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor) inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
{ {
@@ -271,8 +490,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
} }
} }
template<class vobj,class CComplex> template<class CComplex,class VLattice>
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
{ {
GridBase *coarse = ip.Grid(); GridBase *coarse = ip.Grid();
GridBase *fine = Basis[0].Grid(); GridBase *fine = Basis[0].Grid();
@@ -288,14 +507,22 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
for(int u=0;u<v;u++) { for(int u=0;u<v;u++) {
//Inner product & remove component //Inner product & remove component
blockInnerProduct(ip,Basis[u],Basis[v]); blockInnerProductD(ip,Basis[u],Basis[v]);
ip = -ip; ip = -ip;
blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]); blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
} }
blockNormalise(ip,Basis[v]); blockNormalise(ip,Basis[v]);
} }
} }
template<class vobj,class CComplex>
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated inaccurate naming
{
blockOrthonormalize(ip,Basis);
}
#if 0
// TODO: CPU optimized version here
template<class vobj,class CComplex,int nbasis> template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData, Lattice<vobj> &fineData,
@@ -317,11 +544,11 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
auto fineData_ = fineData.View(); autoView( fineData_ , fineData, AcceleratorWrite);
auto coarseData_ = coarseData.View(); autoView( coarseData_ , coarseData, AcceleratorRead);
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
thread_for(sf,fine->oSites(),{ accelerator_for(sf,fine->oSites(),1,{
int sc; int sc;
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
@@ -331,14 +558,37 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View(); /* auto basis_ = Basis[i], );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]; if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]; else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
} }
}); });
return; return;
} }
#else
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData,
const VLattice &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
fineData=Zero();
for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
//Lattice<CComplex> cip(coarse);
//autoView( cip_ , cip, AcceleratorWrite);
//autoView( ip_ , ip, AcceleratorRead);
//accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
// coalescedWrite(cip_[sc], ip_(sc)());
// });
//blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
blockZAXPY(fineData,ip,Basis[i],fineData);
}
}
#endif
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local // Simd layouts need not match since we use peek/poke Local
@@ -362,15 +612,77 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->lSites() == og->lSites()); assert(ig->lSites() == og->lSites());
} }
autoView(in_v,in,CpuRead);
autoView(out_v,out,CpuWrite);
thread_for(idx, ig->lSites(),{ thread_for(idx, ig->lSites(),{
sobj s; sobj s;
ssobj ss; ssobj ss;
Coordinate lcoor(ni); Coordinate lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor); ig->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(s,in,lcoor); peekLocalSite(s,in_v,lcoor);
ss=s; ss=s;
pokeLocalSite(ss,out,lcoor); pokeLocalSite(ss,out_v,lcoor);
});
}
template<class vobj>
void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate FromLowerLeft, Coordinate ToLowerLeft, Coordinate RegionSize)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
static const int words=sizeof(vobj)/sizeof(vector_type);
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
assert(!Fg->_isCheckerBoarded);
assert(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
int nd = nF;
assert(nF == nT);
for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]);
}
// the above should guarantee that the operations are local
Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride;
Coordinate osf = Fg->_ostride;
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
int in_region=1;
for(int d=0;d<nd;d++){
if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){
in_region=0;
}
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
}
if (in_region) {
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t];
for(int w=0;w<words;w++){
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
}
}
}); });
} }
@@ -400,6 +712,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@@ -412,8 +726,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl++];
} }
} }
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDimv,hcoor);
}); });
} }
@@ -441,6 +755,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@@ -453,8 +769,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl++];
} }
} }
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDimv,lcoor);
}); });
} }
@@ -482,6 +798,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@@ -490,8 +808,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
hcoor[orthog] = slice_hi; hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDimv,hcoor);
} }
}); });
} }
@@ -519,6 +837,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@@ -527,15 +847,15 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
hcoor[orthog] = slice_hi; hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDimv,lcoor);
} }
}); });
} }
template<class vobj> template<class vobj>
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine) void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@@ -592,7 +912,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
} }
//loop over outer index //loop over outer index
auto in_v = in.View(); autoView( in_v , in, CpuRead);
thread_for(in_oidx,in_grid->oSites(),{ thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> out_ptrs(in_nsimd); ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@@ -685,7 +1005,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
icoor[lane].resize(ndim); icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane); grid->iCoorFromIindex(icoor[lane],lane);
} }
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for(oidx, grid->oSites(),{ thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> ptrs(nsimd); ExtractPointerArray<sobj> ptrs(nsimd);
@@ -760,54 +1080,96 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
}); });
} }
//Convert a Lattice from one precision to another //The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
template<class VobjOut, class VobjIn> class precisionChangeWorkspace{
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in) std::pair<Integer,Integer>* fmap_device; //device pointer
{ public:
assert(out.Grid()->Nd() == in.Grid()->Nd()); precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
for(int d=0;d<out.Grid()->Nd();d++){ //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]); assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
} }
out.Checkerboard() = in.Checkerboard(); int Nsimd_out = out_grid->Nsimd();
GridBase *in_grid=in.Grid();
GridBase *out_grid = out.Grid();
typedef typename VobjOut::scalar_object SobjOut; std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
typedef typename VobjIn::scalar_object SobjIn; for(int lane=0; lane < out_grid->Nsimd(); lane++)
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
int ndim = out.Grid()->Nd(); std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
int out_nsimd = out_grid->Nsimd();
std::vector<Coordinate > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){
out_icoor[lane].resize(ndim);
out_grid->iCoorFromIindex(out_icoor[lane], lane);
}
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
auto out_v = out.View();
thread_for(out_oidx,out_grid->oSites(),{ thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim); Coordinate out_ocorr;
out_grid->oCoorFromOindex(out_ocoor, out_oidx); out_grid->oCoorFromOindex(out_ocorr, out_oidx);
ExtractPointerArray<SobjOut> ptrs(out_nsimd); Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
Coordinate lcoor(out_grid->Nd()); //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
//Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
for(int lane=0; lane < out_nsimd; lane++){ //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
for(int mu=0;mu<ndim;mu++) int in_oidx = 0, in_lane = 0;
lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu]; for(int d=0;d<in_grid->_ndimension;d++){
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions); in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
ptrs[lane] = &in_slex_conv[llex]; }
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
}
});
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
}
//Prevent moving or copying
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//Convert a lattice of one precision to another. The input workspace contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
int in_oidx = fmap_osite[out_lane].first;
int in_lane = fmap_osite[out_lane].second;
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
} }
merge(out_v[out_oidx], ptrs, 0);
}); });
} }
//Convert a Lattice from one precision to another
//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
precisionChange(out, in, workspace);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Communicate between grids // Communicate between grids
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////

View File

@@ -38,16 +38,18 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose // Transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
/*
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
}); });
return ret; return ret;
}; };
*/
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose // Index level dependent transpose
@@ -56,8 +58,8 @@ template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))> inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{ {
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
}); });

View File

@@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs, rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret, ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{ accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y); ret[ss]=pow(rhs[ss],y);
@@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
} }
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y)); coalescedWrite(ret[ss],mod(rhs(ss),y));
@@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
ret.Checkerboard() = rhs_i.Checkerboard(); ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y)); coalescedWrite(ret[ss],div(rhs(ss),y));
@@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));

173
Grid/lattice/Lattice_view.h Normal file
View File

@@ -0,0 +1,173 @@
#pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
//public:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
ViewAdvise advise;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { return advise; };
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
// Host only
GridBase * getGrid(void) const { return _grid; };
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
ViewMode mode;
void * cpu_ptr;
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
return coalescedRead(this->_odata[i]);
}
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
#if 1
// accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; };
#else
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
#endif
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
this->ViewOpen(mode);
}
// Host functions
void ViewOpen(ViewMode mode)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
this->cpu_ptr = (void *)this->_odata;
this->mode = mode;
this->_odata =(vobj *)
MemoryManager::ViewOpen(this->cpu_ptr,
this->_odata_size*sizeof(vobj),
mode,
this->advise);
}
void ViewClose(void)
{ // Inform the manager
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
ViewCloser(View &_v) : v(_v) {};
~ViewCloser() { v.ViewClose(); }
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
NAMESPACE_END(Grid);

View File

@@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
conformable(iftrue,predicate); conformable(iftrue,predicate);
conformable(iftrue,ret); conformable(iftrue,ret);
GridBase *grid=iftrue._grid; GridBase *grid=iftrue.Grid();
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
@@ -52,22 +52,23 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
std::vector<Integer> mask(Nsimd); autoView(iftrue_v,iftrue,CpuRead);
std::vector<scalar_object> truevals (Nsimd); autoView(iffalse_v,iffalse,CpuRead);
std::vector<scalar_object> falsevals(Nsimd); autoView(predicate_v,predicate,CpuRead);
autoView(ret_v,ret,CpuWrite);
parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){ Integer NN= grid->oSites();
thread_for(ss,NN,{
extract(iftrue._odata[ss] ,truevals); Integer mask;
extract(iffalse._odata[ss] ,falsevals); scalar_object trueval;
extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask); scalar_object falseval;
for(int l=0;l<Nsimd;l++){
for(int s=0;s<Nsimd;s++){ trueval =extractLane(l,iftrue_v[ss]);
if (mask[s]) falsevals[s]=truevals[s]; falseval=extractLane(l,iffalse_v[ss]);
} mask =extractLane(l,predicate_v[ss]);
if (mask) falseval=trueval;
merge(ret._odata[ss],falsevals); insertLane(l,ret_v[ss],falseval);
} }
});
} }
template<class vobj,class iobj> template<class vobj,class iobj>
@@ -76,9 +77,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
conformable(iftrue,iffalse); conformable(iftrue,iffalse);
conformable(iftrue,predicate); conformable(iftrue,predicate);
Lattice<vobj> ret(iftrue._grid); Lattice<vobj> ret(iftrue.Grid());
where(ret,predicate,iftrue,iffalse); whereWolf(ret,predicate,iftrue,iffalse);
return ret; return ret;
} }

View File

@@ -69,6 +69,7 @@ GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE"); GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE"); GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) { void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0); GridLogError.Active(0);
@@ -79,6 +80,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogPerformance.Active(0); GridLogPerformance.Active(0);
GridLogIntegrator.Active(1); GridLogIntegrator.Active(1);
GridLogColours.Active(0); GridLogColours.Active(0);
GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) { for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1); if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
@@ -87,7 +89,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1); if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
} }
} }

View File

@@ -130,6 +130,8 @@ public:
friend std::ostream& operator<< (std::ostream& stream, Logger& log){ friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) { if ( log.active ) {
std::ios_base::fmtflags f(stream.flags());
stream << log.background()<< std::left; stream << log.background()<< std::left;
if (log.topWidth > 0) if (log.topWidth > 0)
{ {
@@ -152,6 +154,8 @@ public:
<< now << log.background() << " : " ; << now << log.background() << " : " ;
} }
stream << log.colour(); stream << log.colour();
stream.flags(f);
return stream; return stream;
} else { } else {
return devnull; return devnull;
@@ -178,6 +182,7 @@ extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance; extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ; extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ; extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern Colours GridLogColours; extern Colours GridLogColours;
std::string demangle(const char* name) ; std::string demangle(const char* name) ;

View File

@@ -1,3 +1,4 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1; int Grid::BinaryIO::latticeWriteMaxRetry = -1;
Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;

View File

@@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO { class BinaryIO {
public: public:
struct IoPerf
{
uint64_t size{0},time{0};
double mbytesPerSecond{0.};
};
static IoPerf lastPerf;
static int latticeWriteMaxRetry; static int latticeWriteMaxRetry;
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
@@ -341,7 +348,7 @@ class BinaryIO {
int ieee32big = (format == std::string("IEEE32BIG")); int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32")); int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG")); int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64")); int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
assert(ieee64||ieee32|ieee64big||ieee32big); assert(ieee64||ieee32|ieee64big||ieee32big);
assert((ieee64+ieee32+ieee64big+ieee32big)==1); assert((ieee64+ieee32+ieee64big+ieee32big)==1);
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
@@ -502,12 +509,15 @@ class BinaryIO {
timer.Stop(); timer.Stop();
} }
lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
lastPerf.time = timer.useconds();
lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
std::cout<<GridLogMessage<<"IOobject: "; std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read "; if ( control & BINARYIO_READ) std::cout << " read ";
else std::cout << " write "; else std::cout << " write ";
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank; uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" " std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl; << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
@@ -663,10 +673,15 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
timer.Start(); timer.Start();
thread_for(lidx,lsites,{ thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel_rng.SetState(tmp,lidx); Coordinate lcoor;
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.SetState(tmp,gidx);
}); });
timer.Stop(); timer.Stop();
@@ -723,7 +738,12 @@ class BinaryIO {
std::vector<RNGstate> iodata(lsites); std::vector<RNGstate> iodata(lsites);
thread_for(lidx,lsites,{ thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
parallel_rng.GetState(tmp,lidx); Coordinate lcoor;
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.GetState(tmp,gidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}); });
timer.Stop(); timer.Stop();

View File

@@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Helper to fill out metadata // Helper to fill out metadata
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class vobj> void ScidacMetaData(Lattice<vobj> & field, template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
FieldMetaData &header, FieldMetaData &header,
scidacRecord & _scidacRecord, scidacRecord & _scidacRecord,
scidacFile & _scidacFile) scidacFile & _scidacFile)
@@ -576,6 +576,8 @@ class ScidacReader : public GridLimeReader {
std::string rec_name(ILDG_BINARY_DATA); std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) { if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
// in principle should do the line below, but that breaks backard compatibility with old data
// skipPastObjectRecord(std::string(GRID_FIELD_NORM));
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM)); skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return; return;
} }
@@ -619,12 +621,12 @@ class IldgWriter : public ScidacWriter {
// Don't require scidac records EXCEPT checksum // Don't require scidac records EXCEPT checksum
// Use Grid MetaData object if present. // Use Grid MetaData object if present.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
template <class vsimd> template <class stats = PeriodicGaugeStatistics>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description)
{ {
GridBase * grid = Umu.Grid(); GridBase * grid = Umu.Grid();
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<vLorentzColourMatrixD> GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj; typedef vLorentzColourMatrixD vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
//////////////////////////////////////// ////////////////////////////////////////
@@ -636,6 +638,9 @@ class IldgWriter : public ScidacWriter {
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
stats Stats;
Stats(Umu,header);
std::string format = header.floating_point; std::string format = header.floating_point;
header.ensemble_id = description; header.ensemble_id = description;
header.ensemble_label = description; header.ensemble_label = description;
@@ -705,10 +710,10 @@ class IldgReader : public GridLimeReader {
// Else use ILDG MetaData object if present. // Else use ILDG MetaData object if present.
// Else use SciDAC MetaData object if present. // Else use SciDAC MetaData object if present.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
template <class vsimd> template <class stats = PeriodicGaugeStatistics>
void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) { void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<vLorentzColourMatrixD > GaugeField;
typedef typename GaugeField::vector_object vobj; typedef typename GaugeField::vector_object vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@@ -921,7 +926,8 @@ class IldgReader : public GridLimeReader {
if ( found_FieldMetaData || found_usqcdInfo ) { if ( found_FieldMetaData || found_usqcdInfo ) {
FieldMetaData checker; FieldMetaData checker;
GaugeStatistics(Umu,checker); stats Stats;
Stats(Umu,checker);
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;

View File

@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
std::time_t t = std::time(nullptr); std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t); std::tm tm_ = *std::localtime(&t);
std::ostringstream oss; std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z"); oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str(); header.creation_date = oss.str();
header.archive_date = header.creation_date; header.archive_date = header.creation_date;
@@ -176,29 +176,18 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
GridMetaData(grid,header); GridMetaData(grid,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header) template<class Impl>
class GaugeStatistics
{ {
// How to convert data precision etc... public:
header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data); void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data); {
} header.link_trace=WilsonLoops<Impl>::linkTrace(data);
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header) header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
{ }
// How to convert data precision etc... };
header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data); typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data); typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
}
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{ {
GridBase *grid = field.Grid(); GridBase *grid = field.Grid();
@@ -206,7 +195,6 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header); GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }
@@ -301,6 +289,30 @@ struct GaugeSimpleUnmunger {
}; };
}; };
template<class fobj,class sobj>
struct GaugeDoubleStoredMunger{
void operator()(fobj &in, sobj &out) {
for (int mu = 0; mu < Nds; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template <class fobj, class sobj>
struct GaugeDoubleStoredUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nds; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template<class fobj,class sobj> template<class fobj,class sobj>
struct Gauge3x2munger{ struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){ void operator() (fobj &in,sobj &out){

View File

@@ -39,6 +39,10 @@ using namespace Grid;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO { class NerscIO : public BinaryIO {
public: public:
typedef Lattice<vLorentzColourMatrixD> GaugeField;
// Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
static inline void truncate(std::string file){ static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out); std::ofstream fout(file,std::ios::out);
@@ -129,12 +133,12 @@ public:
// Now the meat: the object readers // Now the meat: the object readers
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd> template<class GaugeStats=PeriodicGaugeStatistics>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, static inline void readConfiguration(GaugeField &Umu,
FieldMetaData& header, FieldMetaData& header,
std::string file) std::string file,
GaugeStats GaugeStatisticsCalculator=GaugeStats())
{ {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu.Grid(); GridBase *grid = Umu.Grid();
uint64_t offset = readHeader(file,Umu.Grid(),header); uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -146,30 +150,30 @@ public:
int ieee32big = (format == std::string("IEEE32BIG")); int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32")); int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG")); int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64")); int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger; // depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type> // munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) { if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) { if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F> BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format, (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
if ( ieee64 || ieee64big ) { if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D> BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format, (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) { if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF> BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format, (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
if ( ieee64 || ieee64big ) { if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD> BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format, (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
@@ -177,7 +181,7 @@ public:
assert(0); assert(0);
} }
GaugeStatistics(Umu,clone); GaugeStats Stats; Stats(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl; <<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
@@ -196,22 +200,29 @@ public:
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl; std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0); exit(0);
} }
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 ); if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 ); assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum ); assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl; std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
} }
template<class vsimd> // Preferred interface
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"))
{
writeConfiguration(Umu,file,0,1,ens_label);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file, std::string file,
int two_row, int two_row,
int bits32) int bits32,
std::string ens_label = std::string("DWF"))
{ {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef vLorentzColourMatrixD vobj;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
FieldMetaData header; FieldMetaData header;
@@ -219,8 +230,8 @@ public:
// Following should become arguments // Following should become arguments
/////////////////////////////////////////// ///////////////////////////////////////////
header.sequence_number = 1; header.sequence_number = 1;
header.ensemble_id = "UKQCD"; header.ensemble_id = std::string("UKQCD");
header.ensemble_label = "DWF"; header.ensemble_label = ens_label;
typedef LorentzColourMatrixD fobj3D; typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D; typedef LorentzColour2x3D fobj2D;
@@ -229,7 +240,7 @@ public:
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
GaugeStatistics(Umu,header); GaugeStats Stats; Stats(Umu,header);
MachineCharacteristics(header); MachineCharacteristics(header);
uint64_t offset; uint64_t offset;
@@ -354,6 +365,6 @@ public:
} }
}; };
NAMESPACE_END(QCD); NAMESPACE_END(Grid);
#endif #endif

224
Grid/parallelIO/OpenQcdIO.h Normal file
View File

@@ -0,0 +1,224 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/OpenQcdIO.h
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
struct OpenQcdHeader : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
int, Nt,
int, Nx,
int, Ny,
int, Nz,
double, plaq);
};
class OpenQcdIO : public BinaryIO {
public:
static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6
static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) {
OpenQcdHeader header;
{
std::ifstream fin(file, std::ios::in | std::ios::binary);
fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
assert(!fin.fail());
field.data_start = fin.tellg();
fin.close();
}
header.plaq /= normalisationFactor;
// sanity check (should trigger on endian issues)
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
field.dimension[2] = header.Nz;
field.dimension[3] = header.Nt;
std::cout << GridLogDebug << "header: " << header << std::endl;
std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
assert(grid->_ndimension == Nd);
for(int d = 0; d < Nd; d++)
assert(grid->_fdimensions[d] == field.dimension[d]);
field.plaquette = header.plaq;
return field.data_start;
}
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
FieldMetaData& header,
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
assert(grid != nullptr); assert(grid->_ndimension == Nd);
uint64_t offset = readHeader(file, Umu.Grid(), header);
FieldMetaData clone(header);
std::string format("IEEE64"); // they always store little endian double precsision
uint32_t nersc_csum, scidac_csuma, scidac_csumb;
GridCartesian* grid_openqcd = createOpenQcdGrid(grid);
GridRedBlackCartesian* grid_rb = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
typedef DoubleStoredColourMatrixD fobj;
typedef typename DoubleStoredGaugeField::vector_object::scalar_object sobj;
typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;
word w = 0;
std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
std::vector<sobj> scalardata(grid->lSites());
IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
nersc_csum, scidac_csuma, scidac_csumb);
GridStopWatch timer;
timer.Start();
DoubleStoredGaugeField Umu_ds(grid);
auto munge = GaugeDoubleStoredMunger<DoubleStoredColourMatrixD, DoubleStoredColourMatrix>();
Coordinate ldim = grid->LocalDimensions();
thread_for(idx_g, grid->lSites(), {
Coordinate coor;
grid->LocalIndexToLocalCoor(idx_g, coor);
bool isOdd = grid_rb->CheckerBoard(coor) == Odd;
if(!isOdd) continue;
int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
+ coor[Xdir] * ldim[Ydir] * ldim[Zdir]
+ coor[Ydir] * ldim[Zdir]
+ coor[Zdir])/2;
munge(iodata[idx_o], scalardata[idx_g]);
});
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;
timer.Reset(); timer.Start();
vectorizeFromLexOrdArray(scalardata, Umu_ds);
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;
timer.Reset(); timer.Start();
undoDoubleStore(Umu, Umu_ds);
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
PeriodicGaugeStatistics Stats; Stats(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
// clang-format off
std::cout << GridLogMessage << "OpenQcd Configuration " << file
<< " plaquette " << clone.plaquette
<< " header " << header.plaquette
<< " difference " << plaq_diff
<< std::endl;
// clang-format on
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
std::string file) {
std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
exit(EXIT_FAILURE);
}
private:
static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
// exploit GridCartesian to be able to still use IOobject
Coordinate gdim = grid->GlobalDimensions();
Coordinate ldim = grid->LocalDimensions();
Coordinate pcoor = grid->ThisProcessorCoor();
// openqcd does rb on the z direction
gdim[Zdir] /= 2;
ldim[Zdir] /= 2;
// and has the order T X Y Z (from slowest to fastest)
std::swap(gdim[Xdir], gdim[Zdir]);
std::swap(ldim[Xdir], ldim[Zdir]);
std::swap(pcoor[Xdir], pcoor[Zdir]);
GridCartesian* ret = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid());
ret->_ldimensions = ldim;
ret->_processor_coor = pcoor;
return ret;
}
template<class vsimd>
static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
conformable(Umu.Grid(), Umu_ds.Grid());
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
// they store T+, T-, X+, X-, Y+, Y-, Z+, Z-
for(int mu_g = 0; mu_g < Nd; ++mu_g) {
int mu_o = (mu_g + 1) % Nd;
U = PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o)
+ Cshift(PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o + 1), mu_g, +1);
PokeIndex<LorentzIndex>(Umu, U, mu_g);
}
}
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,281 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <ios>
#include <iostream>
#include <limits>
#include <iomanip>
#include <mpi.h>
#include <ostream>
#include <string>
#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;}
#define CHECK_VAR(a) { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; }
// #undef CHECK
// #define CHECK
NAMESPACE_BEGIN(Grid);
class ParRdr {
private:
bool const swap;
MPI_Status status;
MPI_File fp;
int err;
MPI_Datatype oddSiteType;
MPI_Datatype fileViewType;
GridBase* grid;
public:
ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr)
: swap(false)
, grid(gridPtr) {
err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
assert(err == MPI_SUCCESS);
}
virtual ~ParRdr() { MPI_File_close(&fp); }
inline void errInfo(int const err, std::string const& func) {
static char estring[MPI_MAX_ERROR_STRING];
int eclass = -1, len = 0;
MPI_Error_class(err, &eclass);
MPI_Error_string(err, estring, &len);
std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
}
int readHeader(FieldMetaData& field) {
assert((grid->_ndimension == Nd) && (Nd == 4));
assert(Nc == 3);
OpenQcdHeader header;
readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR);
header.plaq /= 3.; // TODO change this into normalizationfactor
// sanity check (should trigger on endian issues) TODO remove?
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
field.dimension[2] = header.Nz;
field.dimension[3] = header.Nt;
for(int d = 0; d < Nd; d++)
assert(grid->FullDimensions()[d] == field.dimension[d]);
field.plaquette = header.plaq;
field.data_start = sizeof(OpenQcdHeader);
return field.data_start;
}
void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
errInfo(err, "MPI_File_read_at_all");
// CHECK_VAR(err)
int read = -1;
MPI_Get_count(&status, datatype, &read);
// CHECK_VAR(read)
assert(nbytes == (uint64_t)read);
assert(err == MPI_SUCCESS);
}
void createTypes() {
constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
Coordinate const L = grid->GlobalDimensions();
Coordinate const l = grid->LocalDimensions();
Coordinate const i = grid->ThisProcessorCoor();
Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
}
void freeTypes() {
err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
}
bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
auto hdr_offset = readHeader(meta);
CHECK
createTypes();
err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
CHECK
int const domainSites = grid->lSites();
domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
// the actual READ
constexpr uint64_t cm_size = 2 * Nc * Nc * sizeof(double); // 2_complex
constexpr uint64_t os_size = Nd * 2 * cm_size; // 2_fwdbwd
constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int adressable elems: floor is fine
uint64_t const n_os = domainSites / 2;
for(uint64_t os_idx = 0; os_idx < n_os;) {
uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
uint64_t const cm = os_idx * Nd * 2;
readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
os_idx += read_os;
}
CHECK
err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
errInfo(err, "MPI_File_set_view1");
assert(err == MPI_SUCCESS);
freeTypes();
std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
return true;
}
};
class OpenQcdIOChromaReference : public BinaryIO {
public:
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Grid::FieldMetaData& header,
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = Umu.Grid();
typedef ColourMatrixD fobj;
std::vector<fobj> iodata(
Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
{
ParRdr rdr(MPI_COMM_WORLD, file, grid);
rdr.readGauge(iodata, header);
} // equivalent to using binaryio
std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
DoubledGaugeField Umu_ds(grid);
vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
FieldMetaData clone(header);
PeriodicGaugeStatistics Stats; Stats(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
// clang-format off
std::cout << GridLogMessage << "OpenQcd Configuration " << file
<< " plaquette " << clone.plaquette
<< " header " << header.plaquette
<< " difference " << plaq_diff
<< std::endl;
// clang-format on
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
private:
template<class vsimd>
static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
Grid::conformable(Umu.Grid(), Umu_ds.Grid());
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
U = PeekIndex<LorentzIndex>(Umu_ds, 2) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 3), 0, +1); PokeIndex<LorentzIndex>(Umu, U, 0);
U = PeekIndex<LorentzIndex>(Umu_ds, 4) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 5), 1, +1); PokeIndex<LorentzIndex>(Umu, U, 1);
U = PeekIndex<LorentzIndex>(Umu_ds, 6) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 7), 2, +1); PokeIndex<LorentzIndex>(Umu, U, 2);
U = PeekIndex<LorentzIndex>(Umu_ds, 0) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 1), 3, +1); PokeIndex<LorentzIndex>(Umu, U, 3);
}
static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
std::vector<ColourMatrixD> const& node_buff,
GridBase* grid) {
assert(node_buff.size() == Nd * grid->lSites());
Coordinate const& l = grid->LocalDimensions();
Coordinate coord(Nd);
int& x = coord[0];
int& y = coord[1];
int& z = coord[2];
int& t = coord[3];
int buff_idx = 0;
for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering
for(x = 0; x < l[0]; ++x)
for(y = 0; y < l[1]; ++y)
for(z = 0; z < l[2]; ++z) {
if((t + z + y + x) % 2 == 0) continue;
int local_idx;
Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());
for(int mu = 0; mu < 2 * Nd; ++mu)
for(int c1 = 0; c1 < Nc; ++c1) {
for(int c2 = 0; c2 < Nc; ++c2) {
u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
}
}
buff_idx += 2 * Nd;
}
assert(node_buff.size() == buff_idx);
}
};
NAMESPACE_END(Grid);

View File

@@ -44,8 +44,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#ifdef __x86_64__ #ifdef __x86_64__
#ifdef GRID_CUDA
accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else
#include <x86intrin.h> #include <x86intrin.h>
#endif #endif
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@@ -89,13 +94,9 @@ inline uint64_t cyclecount(void){
return tmp; return tmp;
} }
#elif defined __x86_64__ #elif defined __x86_64__
#ifdef GRID_NVCC
accelerator_inline uint64_t __rdtsc(void) { return 0; }
#endif
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return __rdtsc(); uint64_t ret = __rdtsc();
// unsigned int dummy; return (uint64_t)ret;
// return __rdtscp(&dummy);
} }
#else #else
@@ -111,7 +112,6 @@ class PerformanceCounter {
private: private:
typedef struct { typedef struct {
public:
uint32_t type; uint32_t type;
uint64_t config; uint64_t config;
const char *name; const char *name;

View File

@@ -110,15 +110,15 @@ public:
#endif #endif
accumulator = std::chrono::duration_cast<GridUsecs>(start-start); accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
} }
GridTime Elapsed(void) { GridTime Elapsed(void) const {
assert(running == false); assert(running == false);
return std::chrono::duration_cast<GridTime>( accumulator ); return std::chrono::duration_cast<GridTime>( accumulator );
} }
uint64_t useconds(void){ uint64_t useconds(void) const {
assert(running == false); assert(running == false);
return (uint64_t) accumulator.count(); return (uint64_t) accumulator.count();
} }
bool isRunning(void){ bool isRunning(void) const {
return running; return running;
} }
}; };

View File

@@ -12773,7 +12773,7 @@ namespace pugi
#undef PUGI__THROW_ERROR #undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR #undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC #ifdef GRID_CUDA
#pragma pop #pragma pop
#endif #endif

View File

@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
static constexpr int Zm = 6; static constexpr int Zm = 6;
static constexpr int Tm = 7; static constexpr int Tm = 7;
static constexpr int Nc=3; static constexpr int Nc=Config_Nc;
static constexpr int Ns=4; static constexpr int Ns=4;
static constexpr int Nd=4; static constexpr int Nd=4;
static constexpr int Nhs=2; // half spinor static constexpr int Nhs=2; // half spinor
@@ -63,6 +63,7 @@ static constexpr int Ngp=2; // gparity index range
#define ColourIndex (2) #define ColourIndex (2)
#define SpinIndex (1) #define SpinIndex (1)
#define LorentzIndex (0) #define LorentzIndex (0)
#define GparityFlavourIndex (0)
// Also should make these a named enum type // Also should make these a named enum type
static constexpr int DaggerNo=0; static constexpr int DaggerNo=0;
@@ -80,6 +81,15 @@ template<typename T> struct isSpinor {
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ; template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ; template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
const int CoarseIndex = 4;
template<typename T> struct isCoarsened {
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
};
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
// ChrisK very keen to add extra space for Gparity doubling. // ChrisK very keen to add extra space for Gparity doubling.
// //
// Also add domain wall index, in a way where Wilson operator // Also add domain wall index, in a way where Wilson operator
@@ -103,8 +113,10 @@ template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVec
template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >; template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
template<typename vtype> using iGparityFlavourVector = iVector<iScalar<iScalar<vtype> >, Ngp>;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >; template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >; template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
// Spin matrix // Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix; typedef iSpinMatrix<Complex > SpinMatrix;
@@ -133,23 +145,23 @@ typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF; typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD; typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
// LorentzColour // LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix; typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
@@ -169,6 +181,16 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
//G-parity flavour matrix
typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
// Spin vector // Spin vector
typedef iSpinVector<Complex > SpinVector; typedef iSpinVector<Complex > SpinVector;
typedef iSpinVector<ComplexF> SpinVectorF; typedef iSpinVector<ComplexF> SpinVectorF;
@@ -214,6 +236,16 @@ typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF; typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD; typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
//G-parity flavour vector
typedef iGparityFlavourVector<Complex > GparityFlavourVector;
typedef iGparityFlavourVector<ComplexF> GparityFlavourVectorF;
typedef iGparityFlavourVector<ComplexD> GparityFlavourVectorD;
typedef iGparityFlavourVector<vComplex > vGparityFlavourVector;
typedef iGparityFlavourVector<vComplexF> vGparityFlavourVectorF;
typedef iGparityFlavourVector<vComplexD> vGparityFlavourVectorD;
// singlets // singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type.
@@ -443,9 +475,9 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
////////////////////////////////////////////// //////////////////////////////////////////////
// Fermion <-> propagator assignements // Fermion <-> propagator assignements
////////////////////////////////////////////// //////////////////////////////////////////////
//template <class Prop, class Ferm> //template <class Prop, class Ferm>
template <class Fimpl> template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c) void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{ {
for(int j = 0; j < Ns; ++j) for(int j = 0; j < Ns; ++j)
{ {
@@ -460,9 +492,9 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
} }
} }
//template <class Prop, class Ferm> //template <class Prop, class Ferm>
template <class Fimpl> template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c) void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{ {
for(int j = 0; j < Ns; ++j) for(int j = 0; j < Ns; ++j)
{ {

View File

@@ -41,7 +41,7 @@ class Action
public: public:
bool is_smeared = false; bool is_smeared = false;
// Heatbath? // Heatbath?
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
virtual std::string action_name() = 0; // return the action name virtual std::string action_name() = 0; // return the action name

View File

@@ -36,7 +36,8 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation // These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams { struct GparityWilsonImplParams {
Coordinate twists; Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
GparityWilsonImplParams() : twists(Nd, 0) {}; GparityWilsonImplParams() : twists(Nd, 0) {};
}; };
@@ -65,7 +66,8 @@ struct StaggeredImplParams {
RealD, tolerance, RealD, tolerance,
int, degree, int, degree,
int, precision, int, precision,
int, BoundsCheckFreq); int, BoundsCheckFreq,
RealD, BoundsCheckTol);
// MaxIter and tolerance, vectors?? // MaxIter and tolerance, vectors??
@@ -76,16 +78,62 @@ struct StaggeredImplParams {
RealD tol = 1.0e-8, RealD tol = 1.0e-8,
int _degree = 10, int _degree = 10,
int _precision = 64, int _precision = 64,
int _BoundsCheckFreq=20) int _BoundsCheckFreq=20,
double _BoundsCheckTol=1e-6)
: lo(_lo), : lo(_lo),
hi(_hi), hi(_hi),
MaxIter(_maxit), MaxIter(_maxit),
tolerance(tol), tolerance(tol),
degree(_degree), degree(_degree),
precision(_precision), precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq),
BoundsCheckTol(_BoundsCheckTol){};
};
/*Action parameters for the generalized rational action
The approximation is for (M^dag M)^{1/inv_pow}
where inv_pow is the denominator of the fractional power.
Default inv_pow=2 for square root, making this equivalent to
the OneFlavourRational action
*/
struct RationalActionParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams,
int, inv_pow,
RealD, lo, //low eigenvalue bound of rational approx
RealD, hi, //high eigenvalue bound of rational approx
int, MaxIter, //maximum iterations in msCG
RealD, action_tolerance, //msCG tolerance in action evaluation
int, action_degree, //rational approx tolerance in action evaluation
RealD, md_tolerance, //msCG tolerance in MD integration
int, md_degree, //rational approx tolerance in MD integration
int, precision, //precision of floating point arithmetic
int, BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
// constructor
RationalActionParams(int _inv_pow = 2,
RealD _lo = 0.0,
RealD _hi = 1.0,
int _maxit = 1000,
RealD _action_tolerance = 1.0e-8,
int _action_degree = 10,
RealD _md_tolerance = 1.0e-8,
int _md_degree = 10,
int _precision = 64,
int _BoundsCheckFreq=20)
: inv_pow(_inv_pow),
lo(_lo),
hi(_hi),
MaxIter(_maxit),
action_tolerance(_action_tolerance),
action_degree(_action_degree),
md_tolerance(_md_tolerance),
md_degree(_md_degree),
precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq){}; BoundsCheckFreq(_BoundsCheckFreq){};
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -40,8 +40,8 @@ public:
public: public:
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out); virtual void M (const FermionField &in, FermionField &out);
virtual RealD Mdag (const FermionField &in, FermionField &out); virtual void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operations // half checkerboard operations
virtual void Meooe (const FermionField &in, FermionField &out); virtual void Meooe (const FermionField &in, FermionField &out);
@@ -102,6 +102,7 @@ public:
// Efficient support for multigrid coarsening // Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void Meooe5D (const FermionField &in, FermionField &out); void Meooe5D (const FermionField &in, FermionField &out);
void MeooeDag5D (const FermionField &in, FermionField &out); void MeooeDag5D (const FermionField &in, FermionField &out);
@@ -140,7 +141,33 @@ public:
Vector<iSinglet<Simd> > MatpInvDag; Vector<iSinglet<Simd> > MatpInvDag;
Vector<iSinglet<Simd> > MatmInvDag; Vector<iSinglet<Simd> > MatmInvDag;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
// Virtual can't template
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
void ContractJ5q(FermionField &q_in,ComplexField &J5q);
///////////////////////////////////////////////////////////////
// Constructors // Constructors
///////////////////////////////////////////////////////////////
CayleyFermion5D(GaugeField &_Umu, CayleyFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid, GridRedBlackCartesian &FiveDimRedBlackGrid,

View File

@@ -0,0 +1,240 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
Copyright (C) 2020 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
//
// Modifications done here:
//
// Original: clover term = 12x12 matrix per site
//
// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
//
// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
//
// Words per site and improvement compared to original (combined with the input and output spinors):
//
// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
// - Minimal: 2*12 + 36 = 60 words -> 2.80 x less
// - Here: 2*12 + 42 = 66 words -> 2.55 x less
//
// These improvements directly translate to wall-clock time
//
// Data layout:
//
// - diagonal and triangle part as separate lattice fields,
// this was faster than as 1 combined field on all tested machines
// - diagonal: as expected
// - triangle: store upper right triangle in row major order
// - graphical:
// 0 1 2 3 4
// 5 6 7 8
// 9 10 11 = upper right triangle indices
// 12 13
// 14
// 0
// 1
// 2
// 3 = diagonal indices
// 4
// 5
// 0
// 1 5
// 2 6 9 = lower left triangle indices
// 3 7 10 12
// 4 8 11 13 14
//
// Impact on total memory consumption:
// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
// - Here: (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts = 24 complex words per site
// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site
// = 84 complex words per site
template<class Impl>
class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
bool open_boundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@@ -41,8 +41,8 @@ public:
public: public:
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out); virtual void M (const FermionField &in, FermionField &out);
virtual RealD Mdag (const FermionField &in, FermionField &out); virtual void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out); virtual void Meooe (const FermionField &in, FermionField &out);
@@ -62,6 +62,7 @@ public:
// Efficient support for multigrid coarsening // Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Physical surface field utilities // Physical surface field utilities

Some files were not shown because too many files have changed in this diff Show More